if(!is.null(dev.list())) dev.off() # Clear plots
rm(list=ls()) # Clean workspace
calcQ <- function(x) { # Function to calculate the different quartiles
s.x <- summary(x)
iqr<-s.x[5]-s.x[2]
list(souti=s.x[2]-3*iqr, mouti=s.x[2]-1.5*iqr, min=s.x[1], q1=s.x[2], q2=s.x[3],
q3=s.x[5], max=s.x[6], mouts=s.x[5]+1.5*iqr, souts=s.x[5]+3*iqr )
}
countNA <- function(x) { # Function to count the NA values
mis_x <- NULL
for (j in 1:ncol(x)) {mis_x[j] <- sum(is.na(x[,j])) }
mis_x <- as.data.frame(mis_x)
rownames(mis_x) <- names(x)
mis_i <- rep(0,nrow(x))
for (j in 1:ncol(x)) {mis_i <- mis_i + as.numeric(is.na(x[,j])) }
list(mis_col=mis_x,mis_ind=mis_i)
}
countX <- function(x,X) { # Function to count a specific number of appearences
n_x <- NULL
for (j in 1:ncol(x)) {n_x[j] <- sum(x[,j]==X) }
n_x <- as.data.frame(n_x)
rownames(n_x) <- names(x)
nx_i <- rep(0,nrow(x))
for (j in 1:ncol(x)) {nx_i <- nx_i + as.numeric(x[,j]==X) }
list(nx_col=n_x,nx_ind=nx_i)
}
We load the necessary packages and set working directory
setwd("~/Documents/uni/FIB-ADEI-LAB/deliverable4")
filepath<-"~/Documents/uni/FIB-ADEI-LAB/deliverable4"
options(contrasts=c("contr.treatment","contr.treatment"))
requiredPackages <- c("missMDA","chemometrics","mvoutlier","effects","FactoMineR","car","lmtest","ggplot2","moments","factoextra","RColorBrewer","dplyr","ggmap","ggthemes","knitr")
missingPackages <- requiredPackages[!(requiredPackages %in% installed.packages()[,"Package"])]
if(length(missingPackages)) install.packages(missingPackages)
lapply(requiredPackages, require, character.only = TRUE)
From the proposed database, we need to select a sample of 5000 records randomly so we can start analyzing our data.
!!!!! PER DESCOMENTAR AL FINAL
#df<-read.table(paste0(filepath,"/green_tripdata_2016-01.csv"),header=T, sep=",")
#set.seed(180998)
#sam<-as.vector(sort(sample(1:nrow(df),5000)))
#df<-df[sam,]
!!! ESBORRAR AL DFINAL
load(paste0(filepath,"/Taxi5000_raw.RData"))
summary(df)
## VendorID lpep_pickup_datetime Lpep_dropoff_datetime Store_and_fwd_flag
## Min. :1.000 Length:5000 Length:5000 Length:5000
## 1st Qu.:2.000 Class :character Class :character Class :character
## Median :2.000 Mode :character Mode :character Mode :character
## Mean :1.788
## 3rd Qu.:2.000
## Max. :2.000
## RateCodeID Pickup_longitude Pickup_latitude Dropoff_longitude
## Min. :1.0 Min. :-75.39 Min. : 0.00 Min. :-75.31
## 1st Qu.:1.0 1st Qu.:-73.96 1st Qu.:40.70 1st Qu.:-73.97
## Median :1.0 Median :-73.95 Median :40.75 Median :-73.94
## Mean :1.1 Mean :-73.89 Mean :40.72 Mean :-73.80
## 3rd Qu.:1.0 3rd Qu.:-73.92 3rd Qu.:40.80 3rd Qu.:-73.91
## Max. :5.0 Max. : 0.00 Max. :41.04 Max. : 0.00
## Dropoff_latitude Passenger_count Trip_distance Fare_amount
## Min. : 0.00 Min. :0.000 Min. : 0.000 Min. :-52.0
## 1st Qu.:40.70 1st Qu.:1.000 1st Qu.: 1.020 1st Qu.: 6.0
## Median :40.75 Median :1.000 Median : 1.800 Median : 9.0
## Mean :40.67 Mean :1.375 Mean : 2.765 Mean : 11.9
## 3rd Qu.:40.79 3rd Qu.:1.000 3rd Qu.: 3.420 3rd Qu.: 14.5
## Max. :41.18 Max. :6.000 Max. :52.790 Max. :200.0
## Extra MTA_tax Tip_amount Tolls_amount
## Min. :-1.0000 Min. :-0.5000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.5000 1st Qu.: 0.000 1st Qu.: 0.00000
## Median : 0.5000 Median : 0.5000 Median : 0.000 Median : 0.00000
## Mean : 0.3517 Mean : 0.4857 Mean : 1.217 Mean : 0.08369
## 3rd Qu.: 0.5000 3rd Qu.: 0.5000 3rd Qu.: 2.000 3rd Qu.: 0.00000
## Max. : 1.0000 Max. : 0.5000 Max. :96.000 Max. :18.04000
## Ehail_fee improvement_surcharge Total_amount Payment_type
## Mode:logical Min. :-0.3000 Min. :-52.80 Min. :1.00
## NA's:5000 1st Qu.: 0.3000 1st Qu.: 7.80 1st Qu.:1.00
## Median : 0.3000 Median : 11.16 Median :2.00
## Mean : 0.2914 Mean : 14.33 Mean :1.52
## 3rd Qu.: 0.3000 3rd Qu.: 17.16 3rd Qu.:2.00
## Max. : 0.3000 Max. :260.00 Max. :4.00
## Trip_type
## Min. :1.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.023
## 3rd Qu.:1.000
## Max. :2.000
names(df)[names(df) == "VendorID"] <- "q.vendor_id"
names(df)[names(df) == "lpep_pickup_datetime"] <- "qual.lpep_pickup_datetime"
names(df)[names(df) == "Lpep_dropoff_datetime"] <- "qual.lpep_dropoff_datetime"
names(df)[names(df) == "Store_and_fwd_flag"] <- "qual.store_and_fwd_flag"
names(df)[names(df) == "RateCodeID"] <- "q.rate_code_id"
names(df)[names(df) == "Pickup_longitude"] <- "q.pickup_longitude"
names(df)[names(df) == "Pickup_latitude"] <- "q.pickup_latitude"
names(df)[names(df) == "Dropoff_longitude"] <- "q.dropoff_longitude"
names(df)[names(df) == "Dropoff_latitude"] <- "q.dropoff_latitude"
names(df)[names(df) == "Passenger_count"] <- "q.passenger_count"
names(df)[names(df) == "Trip_distance"] <- "q.trip_distance"
names(df)[names(df) == "Fare_amount"] <- "q.fare_amount"
names(df)[names(df) == "Extra"] <- "q.extra"
names(df)[names(df) == "MTA_tax"] <- "q.mta_tax"
names(df)[names(df) == "Tip_amount"] <- "q.tip_amount"
names(df)[names(df) == "Tolls_amount"] <- "q.tolls_amount"
df$Ehail_fee <- NULL # deleting it --> only NA's
names(df)[names(df) == "improvement_surcharge"] <- "q.improvement_surcharge"
names(df)[names(df) == "Total_amount"] <- "q.target.total_amount"
names(df)[names(df) == "Payment_type"] <- "q.payment_type"
names(df)[names(df) == "Trip_type"] <- "q.trip_type"
summary(df); names(df)
## q.vendor_id qual.lpep_pickup_datetime qual.lpep_dropoff_datetime
## Min. :1.000 Length:5000 Length:5000
## 1st Qu.:2.000 Class :character Class :character
## Median :2.000 Mode :character Mode :character
## Mean :1.788
## 3rd Qu.:2.000
## Max. :2.000
## qual.store_and_fwd_flag q.rate_code_id q.pickup_longitude q.pickup_latitude
## Length:5000 Min. :1.0 Min. :-75.39 Min. : 0.00
## Class :character 1st Qu.:1.0 1st Qu.:-73.96 1st Qu.:40.70
## Mode :character Median :1.0 Median :-73.95 Median :40.75
## Mean :1.1 Mean :-73.89 Mean :40.72
## 3rd Qu.:1.0 3rd Qu.:-73.92 3rd Qu.:40.80
## Max. :5.0 Max. : 0.00 Max. :41.04
## q.dropoff_longitude q.dropoff_latitude q.passenger_count q.trip_distance
## Min. :-75.31 Min. : 0.00 Min. :0.000 Min. : 0.000
## 1st Qu.:-73.97 1st Qu.:40.70 1st Qu.:1.000 1st Qu.: 1.020
## Median :-73.94 Median :40.75 Median :1.000 Median : 1.800
## Mean :-73.80 Mean :40.67 Mean :1.375 Mean : 2.765
## 3rd Qu.:-73.91 3rd Qu.:40.79 3rd Qu.:1.000 3rd Qu.: 3.420
## Max. : 0.00 Max. :41.18 Max. :6.000 Max. :52.790
## q.fare_amount q.extra q.mta_tax q.tip_amount
## Min. :-52.0 Min. :-1.0000 Min. :-0.5000 Min. : 0.000
## 1st Qu.: 6.0 1st Qu.: 0.0000 1st Qu.: 0.5000 1st Qu.: 0.000
## Median : 9.0 Median : 0.5000 Median : 0.5000 Median : 0.000
## Mean : 11.9 Mean : 0.3517 Mean : 0.4857 Mean : 1.217
## 3rd Qu.: 14.5 3rd Qu.: 0.5000 3rd Qu.: 0.5000 3rd Qu.: 2.000
## Max. :200.0 Max. : 1.0000 Max. : 0.5000 Max. :96.000
## q.tolls_amount q.improvement_surcharge q.target.total_amount
## Min. : 0.00000 Min. :-0.3000 Min. :-52.80
## 1st Qu.: 0.00000 1st Qu.: 0.3000 1st Qu.: 7.80
## Median : 0.00000 Median : 0.3000 Median : 11.16
## Mean : 0.08369 Mean : 0.2914 Mean : 14.33
## 3rd Qu.: 0.00000 3rd Qu.: 0.3000 3rd Qu.: 17.16
## Max. :18.04000 Max. : 0.3000 Max. :260.00
## q.payment_type q.trip_type
## Min. :1.00 Min. :1.000
## 1st Qu.:1.00 1st Qu.:1.000
## Median :2.00 Median :1.000
## Mean :1.52 Mean :1.023
## 3rd Qu.:2.00 3rd Qu.:1.000
## Max. :4.00 Max. :2.000
## [1] "q.vendor_id" "qual.lpep_pickup_datetime"
## [3] "qual.lpep_dropoff_datetime" "qual.store_and_fwd_flag"
## [5] "q.rate_code_id" "q.pickup_longitude"
## [7] "q.pickup_latitude" "q.dropoff_longitude"
## [9] "q.dropoff_latitude" "q.passenger_count"
## [11] "q.trip_distance" "q.fare_amount"
## [13] "q.extra" "q.mta_tax"
## [15] "q.tip_amount" "q.tolls_amount"
## [17] "q.improvement_surcharge" "q.target.total_amount"
## [19] "q.payment_type" "q.trip_type"
Initialization of counts for missings, outliers and errors. All numerical variables have to be checked before
imis<-rep(0,nrow(df)); mis1<-countNA(df); imis<-mis1$mis_ind
jmis<-rep(0,2*ncol(df))
iouts<-rep(0,nrow(df))
jouts<-rep(0,2*ncol(df))
ierrs<-rep(0,nrow(df))
jerrs<-rep(0,2*ncol(df))
Description: Original numeric variables corresponding to qualitative concepts have to be converted to factors. New factors grouping original levels will be considered very positively.
We need to do an analysis of all the variables to be able to identify missings, errors and outliers. We will also try to factorize each variable to make it easier to understand the sample.
df$q.hour<-as.numeric(substr(strptime(df$qual.lpep_pickup_datetime, "%Y-%m-%d %H:%M:%S"),12,13))
df$f.period<-1
df$f.period[df$q.hour>7]<-2
df$f.period[df$q.hour>10]<-3
df$f.period[df$q.hour>16]<-4
df$f.period[df$q.hour>20]<-1
df$f.period<-factor(df$f.period,labels=paste("period",c("night","morning","valley","afternoon")))
barplot(summary(df$f.period),main="period barplot",col="darkslateblue")
This variable expresses the Creative Mobile Technologies, LLC as 1 and Verifone Inc as 2, so we create a factor to make it more readable. With the initial summary we see that this variable does not have any missing value, so we proceed to factor it.
names(df)[names(df) == "q.vendor_id"] <- "f.vendor_id"
df$f.vendor_id<-factor(df$f.vendor_id,labels=c("vendor_id_mobile","vendor_id_verifone"))
barplot(summary(df$f.vendor_id),main="vendor_id barplot",col="darkslateblue")
This variable expresses the different RateCodeIDs that we can have as numerical values, so we need to categorize them in order to be able to work with them.
names(df)[names(df) == "q.rate_code_id"] <- "f.rate_code_id"
df$f.rate_code_id<-factor(df$f.rate_code_id)
barplot(summary(df$f.rate_code_id),main="rate_code_id barplot",col="darkslateblue")
We see that most samples are in RateCodeID = 1, which is what we are interested in. Therefore, we factorize and create only two groups, the one with RateCodeID = 1 and the rest.
df$f.rate_code_id[df$f.rate_code_id != 1] = 2
df$f.rate_code_id <- factor(df$f.rate_code_id, labels=c("rate_code_id_1","rate_code_id_other"))
barplot(summary(df$f.rate_code_id),main="new rate_code_id barplot",col="darkslateblue")
Now is more balanced.
This is a categorical variable with the values Y and N, so we need to factor it.
names(df)[names(df) == "qual.store_and_fwd_flag"] <- "f.store_and_fwd_flag"
df$f.store_and_fwd_flag<-factor(df$f.store_and_fwd_flag, labels=c("flag-no","flag-yes"))
summary(df$f.store_and_fwd_flag)
## flag-no flag-yes
## 4982 18
This variable is categorical but it is expressed as numerical, so we need to factor it in order to be able to work with it.
names(df)[names(df) == "q.payment_type"] <- "f.payment_type"
df$f.payment_type<-factor(df$f.payment_type,labels=c("credit card","cash","no charge","dispute"))
barplot(summary(df$f.payment_type),main="payment_type barplot",col="darkslateblue")
As we can see, there are few values with “No charge” or “Dispute” category, so we decided to categorize it into a new category (“No paid”).
levels(df$f.payment_type) <- c("credit card","cash","no paid","no paid")
barplot(summary(df$f.payment_type),main="new payment_type barplot",col="darkslateblue")
Now is more balanced.
This variable is categorical but it is expressed as numerical, so we need to factor it in order to be able to work with it.
names(df)[names(df) == "q.trip_type"] <- "f.trip_type"
df$f.trip_type<-factor(df$f.trip_type,labels=c("trip_street_hail","trip_dispatch"))
barplot(summary(df$f.trip_type),main="trip_type barplot",col="darkslateblue")
Description: Original numeric variables corresponding to real quantitative concepts are kept as numeric but additional factors should also be created as a discretization of each numeric variable.
We only keep the hours (variables 2 and 3) to be able to work with time slots in the future.
Create new variables derived from the original ones, as effective speed, travel time, hour of request, period of request, effective trip distance (in km)
df$q.tlenkm<-df$q.trip_distance*1.609344 # Miles to km
df$q.travel_time<-(as.numeric(as.POSIXct(df$qual.lpep_dropoff_datetime)) - as.numeric(as.POSIXct(df$qual.lpep_pickup_datetime)))/60
df$q.espeed<-(df$q.tlenkm/(df$q.travel_time))*60
sel<-which(is.na(df$q.espeed<=0))
imis[sel]<-imis[sel]+1
jmis[25]<-length(sel)
We detect as error those speeds smaller than 0 and bigger than 200
summary(df$q.espeed)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 14.60 18.58 23.07 23.70 3881.74 2
sel<-which((df$q.espeed<=0)|(df$q.espeed > 200))
ierrs[sel]<-ierrs[sel]+1
jerrs[25]<-length(sel)
Sel contains the rownames of the individuals with “0” as value for longitude
df[sel,"q.espeed"]<-NA
Boxplot(df$q.espeed)
## [1] 4780 3001 3066 1936 120 3578 1767 4824 2685 3009 2647 2804 2546 3865 1702
## [16] 4995 1354 3849 132 2075
var_out<-calcQ(df$q.espeed)
abline(h=var_out$souts,col="red")
abline(h=var_out$souti,col="red")
llout<-which((df$q.espeed<=3)|(df$q.espeed>80))
iouts[llout]<-iouts[llout]+1
jouts[25]<-length(llout)
df[llout,"q.espeed"]<-NA
We just keep the hours
df$qual.pickup<-substr(strptime(df$qual.lpep_pickup_datetime, "%Y-%m-%d %H:%M:%S"), 12, 13)
We just keep the hours
df$qual.dropoff<-substr(strptime(df$qual.lpep_dropoff_datetime, "%Y-%m-%d %H:%M:%S"), 12, 13)
summary(df$q.passenger_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 1.000 1.375 1.000 6.000
We set the 0 as an error because it is not possible to have a trip without passengers
sel<-which(df$q.passenger_count == 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[10]<-length(sel)
Sel contains the rownames of the individuals with “0” as value for passengers
df[sel,"q.passenger_count"]<-NA
summary(df$q.trip_distance)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.020 1.800 2.765 3.420 52.790
We see on the summary that there are not NA values, so we proceed to the outlier and error detection.
In order to evalute or data, we decide to set the maximum trip distance to 30, so we proceed to delete the outliers.
Boxplot(df$q.trip_distance)
## [1] 2680 4072 1702 2075 723 3107 2691 1105 4301 3154
var_out<-calcQ(df$q.trip_distance)
abline(h=var_out$souts,col="red")
abline(h=var_out$souti,col="red")
abline(h=30,col="blue",lwd=2)
llout<-which(df$q.trip_distance>30)
iouts[llout]<-iouts[llout]+1
jouts[11]<-length(llout)
We decide that an incorrect trip distance is the one with 0 miles or less. In order to be aware of this error we store it at ierrs, and jerrs. ierrs stores the number of errors in a row, and jerrs stores the total amount of errors in a variable.
sel<-which(df$q.trip_distance <= 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[11]<-length(sel)
Now, we set NA values in order to remove errors and outliersfrom the dataset
setNA<-which((df$q.trip_distance<=0) | (df$q.trip_distance > 30))
df[setNA,"q.trip_distance"]<-NA
We are going to set a categorical variable for the f.trip_distance_range We decided to create 3 levels: “trip_dist_short”, “trip_dist_medium” and“trip_dist_long”. - trip_dist_short: <= 2.5 - trip_dist_medium: 2.5 < q.trip_distance <= 5 - trip_dist_long: > 5
df$f.trip_distance_range[df$q.trip_distance <= 2.5] = "trip_dist_short"
df$f.trip_distance_range[(df$q.trip_distance > 2.5) & (df$q.trip_distance <= 5)] = "trip_dist_medium"
df$f.trip_distance_range[df$q.trip_distance > 5] = "trip_dist_long"
df$f.trip_distance_range <- factor(df$f.trip_distance_range)
We see a barplot for the factor we created.
barplot(table(df$f.trip_distance_range),main="trip_distance_range Barplot",col="darkslateblue")
We know that New York’s longitude is -73.9385, so values that differ a lot from this value is an error or an outlier.
summary(df$q.pickup_longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -75.39 -73.96 -73.95 -73.89 -73.92 0.00
0.00 looks to be an error Seeing the individuals with this “0” value: df[which(df[,“q.pickup_longitude”]==0),] it is a quantitive variable. Non-possible values will be recoded as errors, so will be transformed to NA.
sel<-which(df$q.pickup_longitude == 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[6]<-length(sel)
df[sel,"q.pickup_longitude"]<-NA
Non-possible values are replaced by NA, missing value symbol in R.
We are deleting trips from outside New York. This means we are not using longitudes bigger than -73.80 and smaller than -74.02.
llout <-which((df$q.pickup_longitude < -74.02) | (df$q.pickup_longitude > -73.80))
iouts[llout]<-iouts[llout]+1
jouts[6]<-length(llout)
df[llout,"q.pickup_longitude"]<-NA
We know that New York’s latitude is 40.6643, so values that differ a lot from this value is an error or an outlier.
summary(df$q.pickup_latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 40.70 40.75 40.72 40.80 41.04
0.00 looks to be an error. Seeing the individuals with this “0” value: df[which(df[,“q.pickup_latitude”]==0),] it is a quantitive variable. non-possible values will be recoded as errors, so will be transformed to NA.
sel<-which(df$q.pickup_latitude == 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[7]<-length(sel)
df[sel,"q.pickup_latitude"]<-NA
Non-possible values are replaced by NA, missing value symbol in R. We are deleting trips from outside New York. This means we are not using latitudes bigger than 40.54 and smallerthan 40.86
llout <-which((df$q.pickup_latitude < 40.54) | (df$q.pickup_latitude > 40.86))
iouts[llout]<-iouts[llout]+1
jouts[7]<-length(llout)
df[llout,"q.pickup_latitude"]<-NA
We know that New York’s longitude is -73.9385, so values that differ a lot from this value is an error or an outlier.
summary(df$q.dropoff_longitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -75.31 -73.97 -73.94 -73.80 -73.91 0.00
0.00 looks to be an error Seeing the individuals with this “0” value: df[which(df[,“q.dropoff_longitude”]==0),] it is a quantitive variable.
Non-possible values will be recoded as errors, so will be transformed to NA.
sel<-which(df$q.dropoff_longitude == 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[8]<-length(sel)
df[sel,"q.dropoff_longitude"]<-NA
Non-possible values are replaced by NA, missing value symbol in R. We are deleting trips from outside New York. This means we are not using longitudes bigger than -73.80 and smaller than -74.02.
llout <-which((df$q.dropoff_longitude < -74.02) | (df$q.dropoff_longitude > -73.80))
iouts[llout]<-iouts[llout]+1
jouts[8]<-length(llout)
df[llout,"q.dropoff_longitude"]<-NA
We know that New York’s latitude is 40.6643, so values that differ a lot from this value is an error or an outlier.
summary(df$q.dropoff_latitude)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 40.70 40.75 40.67 40.79 41.18
0.00 looks to be an error Seeing the individuals with this “0” value: df[which(df[,“q.dropoff_latitude”]==0),] it is a quantitive variable. Non-possible values will be recoded as errors, so will be transformed to NA.
sel<-which(df$q.dropoff_latitude == 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[9]<-length(sel)
Sel contains the rownames of the individuals with “0” as value for longitude
df[sel,"q.dropoff_latitude"]<-NA
Non-possible values are replaced by NA, missing value symbol in R. We are deleting trips from outside New York. This means we are not using latitude bigger than 40.54 and smaller than 40.86
llout <-which((df$q.dropoff_latitude < 40.54) | (df$q.dropoff_latitude > 40.86))
iouts[llout]<-iouts[llout]+1
jouts[9]<-length(llout)
Now that we have the outliers, we are setting them as NA
df[llout,"q.dropoff_latitude"]<-NA
We know that the fare should be positive, as it is the price of the trip, so we’ll treat as error those values. The next we’ll do is decide the outliers.
summary(df$q.fare_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -52.0 6.0 9.0 11.9 14.5 200.0
sel<-which(df$q.fare_amount <= 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[12]<-length(sel)
df[sel,"q.fare_amount"]<-NA
Non-possible values are replaced by NA, missing value symbol in R
Boxplot(df$q.fare_amount)
## [1] 633 634 2680 4072 1702 4284 2075 2560 3755 723
var_out<-calcQ(df$q.fare_amount)
abline(h=var_out$souts,col="red")
abline(h=var_out$souti,col="red")
abline(h=60,col="blue",lwd=2)
We decide to set outliers for fare amounts bigger than 60, because the majority of the values are concentrated between 0 and 60.
llout<-which(df$q.fare_amount>60)
iouts[llout]<-iouts[llout]+1
jouts[12]<-length(llout)
df[llout,"q.fare_amount"]<-NA
As this variable is price related, it cannot have negative values, so this individuals will be treated as errors.
table(df$q.extra)
##
## -1 -0.5 0 0.5 1
## 2 5 2296 1868 829
As it is a price related variable, negative values should be treated as errors, and the other values are the ones defined for this variable, so there are not outliers.
sel<-which(df$q.extra < 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[13]<-length(sel)
df[sel,"q.extra"]<-NA
This variable corresponds to a tax that must be charged in every trip and its cost is $0.50, so values different from this are errors, and we don’t have to take into account outliers because after the errors detection all values should be the MTA_tax.
table(df$q.mta_tax)
##
## -0.5 0 0.5
## 10 123 4867
Important note: We assume that when this tax is smaller than 0, it is an error. If tax is 0, we say that payment in these cases is equivalent to “no paid”.
sel<-which(df$q.mta_tax < 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[14]<-length(sel)
df[sel,"q.mta_tax"]<-NA
This variable corresponds to a charge that must be charged in every trip and its cost is $0.30, so values smaller than 0 are errors, and we don’t have to take into account outliers because after the errors detection all values should be the Improvement surcharge.
table(df$q.improvement_surcharge)
##
## -0.3 0 0.3
## 11 121 4868
We see that the 0 individuals are errors.
sel<-which(df$q.improvement_surcharge < 0)
ierrs[sel]<-ierrs[sel]+1
jerrs[17]<-length(sel)
df[sel,"q.improvement_surcharge"]<-NA
As this is a price related variable, negative values should be considered as errors, and big tips should be considered as outliers. Also tip amounts bigger than 0 for individuals with payment_type = “Cash” should be considered as errors as well.
summary(df$q.tip_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.217 2.000 96.000
We proceed to check if the 0 values are related with payment_type = “credit card” and the passenger did not tip.
table(df$q.tip_amount>0, df$f.payment_type)
##
## credit card cash no paid
## FALSE 357 2506 39
## TRUE 2098 0 0
Now, we proceed to the outlier detection.
Boxplot(df$q.tip_amount)
## [1] 4181 633 634 3295 4918 2194 1702 46 1433 2075
var_out<-calcQ(df$q.tip_amount)
abline(h=var_out$souts,col="red")
abline(h=var_out$souti,col="red")
abline(h=40,col="blue",lwd=2)
llout<-which(df$q.tip_amount>40)
iouts[llout]<-iouts[llout]+1
jouts[15]<-length(llout)
df[llout,"q.tip_amount"]<-NA
As this is a price related variable, negative values should be considered as errors.
table(df$q.tolls_amount)
##
## 0 2.54 5.54 8 10.5 11.08 11.75 18.04
## 4931 2 60 1 2 2 1 1
We see that there are not negative values, so we do not have errors. We proceed now to the outlier detection.
Boxplot(df$q.tolls_amount)
## [1] 2194 2560 3040 3289 415 2864 2474 122 347 379
var_out<-calcQ(df$q.tolls_amount)
abline(h=var_out$souts,col="red")
abline(h=var_out$souti,col="red")
As we see in the boxplot and the table, the majority of the individuals are 0, so the values bigger than 5.54 will be outliers.
llout<-which(df$q.tolls_amount>5.54)
iouts[llout]<-iouts[llout]+1
jouts[16]<-length(llout)
df[llout,"q.tolls_amount"]<-NA
This is a price related variable, so negative values should be treated as errors. Also, we need to sum the “q.fare_amount”, “q.extra”,“q.mta_tax”, “q.improvement_surcharge”, “q.tip_amount” and the “q.tolls_amount” in order to see if the q.target.total_amount matches with this sum.
summary(df$q.target.total_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -52.80 7.80 11.16 14.33 17.16 260.00
Negative values seem to be errors - 0 Total_amount is possible when Payment_type ==“No charge”
We proceed to check if total amount is correctsumming the other variables and checking negatives values:
sum_total_amount = (
df$q.fare_amount +
df$q.extra +
df$q.mta_tax +
df$q.improvement_surcharge +
df$q.tip_amount +
df$q.tolls_amount
)
sel<-which((df$q.target.total_amount != sum_total_amount) | (df$q.target.total_amount<0))
if (length(sel)>0) {
ierrs[sel]<-ierrs[sel]+1
jerrs[18]<-length(sel)
}
df[sel,"q.target.total_amount"]<-NA
Boxplot(df$q.target.total_amount)
## [1] 633 634 2680 1702 4072 2194 2075 4181 4284 2560
var_out<-calcQ(df$q.target.total_amount)
abline(h=var_out$souts,col="red")
abline(h=var_out$souti,col="red")
abline(h=150,col="blue",lwd=2)
llout<-which(df$q.target.total_amount>150)
iouts[llout]<-iouts[llout]+1
jouts[18]<-length(llout)
df[llout,"q.target.total_amount"]<-NA
Per each variable, we have to count the following:
missings_ranking_sortlist <- sort.list(mis1$mis_col, decreasing = TRUE)
for (i in missings_ranking_sortlist) {
print(paste(names(df)[i], " : ", mis1$mis_col$mis_x[i]))
}
## [1] "f.vendor_id : 0"
## [1] "qual.lpep_pickup_datetime : 0"
## [1] "qual.lpep_dropoff_datetime : 0"
## [1] "f.store_and_fwd_flag : 0"
## [1] "f.rate_code_id : 0"
## [1] "q.pickup_longitude : 0"
## [1] "q.pickup_latitude : 0"
## [1] "q.dropoff_longitude : 0"
## [1] "q.dropoff_latitude : 0"
## [1] "q.passenger_count : 0"
## [1] "q.trip_distance : 0"
## [1] "q.fare_amount : 0"
## [1] "q.extra : 0"
## [1] "q.mta_tax : 0"
## [1] "q.tip_amount : 0"
## [1] "q.tolls_amount : 0"
## [1] "q.improvement_surcharge : 0"
## [1] "q.target.total_amount : 0"
## [1] "f.payment_type : 0"
## [1] "f.trip_type : 0"
errors_ranking_sortlist <- sort.list(jerrs, decreasing = TRUE)
for (i in errors_ranking_sortlist) {
if(!is.na(names(df)[i])) { print(paste(names(df)[i], " : ", jerrs[i])) }
}
## [1] "q.target.total_amount : 374"
## [1] "q.espeed : 73"
## [1] "q.trip_distance : 66"
## [1] "q.fare_amount : 24"
## [1] "q.improvement_surcharge : 11"
## [1] "q.mta_tax : 10"
## [1] "q.dropoff_longitude : 9"
## [1] "q.dropoff_latitude : 9"
## [1] "q.extra : 7"
## [1] "q.pickup_longitude : 3"
## [1] "q.pickup_latitude : 3"
## [1] "q.passenger_count : 2"
## [1] "f.vendor_id : 0"
## [1] "qual.lpep_pickup_datetime : 0"
## [1] "qual.lpep_dropoff_datetime : 0"
## [1] "f.store_and_fwd_flag : 0"
## [1] "f.rate_code_id : 0"
## [1] "q.tip_amount : 0"
## [1] "q.tolls_amount : 0"
## [1] "f.payment_type : 0"
## [1] "f.trip_type : 0"
## [1] "q.hour : 0"
## [1] "f.period : 0"
## [1] "q.tlenkm : 0"
## [1] "q.travel_time : 0"
## [1] "qual.pickup : 0"
## [1] "qual.dropoff : 0"
## [1] "f.trip_distance_range : 0"
errors_ranking_sortlist <- sort.list(jouts, decreasing = TRUE)
for (i in errors_ranking_sortlist) {
if(!is.na(names(df)[i])) print(paste(names(df)[i], " : ", jouts[i]))
}
## [1] "q.dropoff_latitude : 116"
## [1] "q.dropoff_longitude : 113"
## [1] "q.pickup_latitude : 84"
## [1] "q.espeed : 39"
## [1] "q.fare_amount : 20"
## [1] "q.pickup_longitude : 19"
## [1] "q.tolls_amount : 7"
## [1] "q.trip_distance : 4"
## [1] "q.tip_amount : 4"
## [1] "q.target.total_amount : 3"
## [1] "f.vendor_id : 0"
## [1] "qual.lpep_pickup_datetime : 0"
## [1] "qual.lpep_dropoff_datetime : 0"
## [1] "f.store_and_fwd_flag : 0"
## [1] "f.rate_code_id : 0"
## [1] "q.passenger_count : 0"
## [1] "q.extra : 0"
## [1] "q.mta_tax : 0"
## [1] "q.improvement_surcharge : 0"
## [1] "f.payment_type : 0"
## [1] "f.trip_type : 0"
## [1] "q.hour : 0"
## [1] "f.period : 0"
## [1] "q.tlenkm : 0"
## [1] "q.travel_time : 0"
## [1] "qual.pickup : 0"
## [1] "qual.dropoff : 0"
## [1] "f.trip_distance_range : 0"
Per each individuals, we have to count the following:
barplot(table(imis),main="missings per individual barplot",col="darkslateblue")
We see that there are no native missing values (remember we deleted Ehail_fee).
As we can see, most individuals have no mistakes.
barplot(table(ierrs),main="errors per individual earplot",col="darkslateblue")
barplot(table(iouts),main="Outliers per individual Barplot",col="darkslateblue")
total_missings <- 0; total_outliers <- 0; total_errors <- 0;
for (m in imis) {total_missings <- total_missings + m}
for (o in iouts) {total_outliers <- total_outliers + o}
for (e in ierrs) {total_errors <- total_errors + e}
Now, let’s print this variables:
total_missings
## [1] 2
total_outliers
## [1] 409
total_errors
## [1] 591
| ## Delete some unecessary variables |
r df$qual.lpep_pickup_datetime <- NULL df$qual.lpep_dropoff_datetime <- NULL names(df) |
## [1] "f.vendor_id" "f.store_and_fwd_flag" ## [3] "f.rate_code_id" "q.pickup_longitude" ## [5] "q.pickup_latitude" "q.dropoff_longitude" ## [7] "q.dropoff_latitude" "q.passenger_count" ## [9] "q.trip_distance" "q.fare_amount" ## [11] "q.extra" "q.mta_tax" ## [13] "q.tip_amount" "q.tolls_amount" ## [15] "q.improvement_surcharge" "q.target.total_amount" ## [17] "f.payment_type" "f.trip_type" ## [19] "q.hour" "f.period" ## [21] "q.tlenkm" "q.travel_time" ## [23] "q.espeed" "qual.pickup" ## [25] "qual.dropoff" "f.trip_distance_range" |
library(missMDA)
What we do with imputation is be able to eliminate all those values that may be missings, outliers or errors to turn them into values that can be realistic within our sample.
We will now do the study by variables and try to impute the necessary observations.
Note: we do not include MTA_tax (14) nor improvement_surcharge(18). We proceed to delete NA values from Total_amount because it is our target variable, so we do not impute it, but we need to have this variable without NAs.
df <- df[!is.na(df$q.target.total_amount),]
names(df)
## [1] "f.vendor_id" "f.store_and_fwd_flag"
## [3] "f.rate_code_id" "q.pickup_longitude"
## [5] "q.pickup_latitude" "q.dropoff_longitude"
## [7] "q.dropoff_latitude" "q.passenger_count"
## [9] "q.trip_distance" "q.fare_amount"
## [11] "q.extra" "q.mta_tax"
## [13] "q.tip_amount" "q.tolls_amount"
## [15] "q.improvement_surcharge" "q.target.total_amount"
## [17] "f.payment_type" "f.trip_type"
## [19] "q.hour" "f.period"
## [21] "q.tlenkm" "q.travel_time"
## [23] "q.espeed" "qual.pickup"
## [25] "qual.dropoff" "f.trip_distance_range"
vars_quantitatives <- names(df)[c(4,5,6,7,8,9,10,11,12,13,14,15,16,21,22,23)]
# [1] "q.pickup_longitude" "q.pickup_latitude"
# [3] "q.dropoff_longitude" "q.dropoff_latitude"
# [5] "q.passenger_count" "q.trip_distance"
# [7] "q.fare_amount" "q.extra"
# [9] "q.mta_tax" "q.tip_amount"
# [11] "q.tolls_amount" "q.improvement_surcharge"
# [13] "q.target.total_amount" "q.tlenkm"
# [15] "q.travel_time" "q.espeed"
summary(df[,vars_quantitatives])
## q.pickup_longitude q.pickup_latitude q.dropoff_longitude q.dropoff_latitude
## Min. :-74.02 Min. :40.58 Min. :-74.02 Min. :40.58
## 1st Qu.:-73.96 1st Qu.:40.70 1st Qu.:-73.97 1st Qu.:40.70
## Median :-73.94 Median :40.75 Median :-73.94 Median :40.75
## Mean :-73.93 Mean :40.75 Mean :-73.94 Mean :40.74
## 3rd Qu.:-73.92 3rd Qu.:40.80 3rd Qu.:-73.91 3rd Qu.:40.79
## Max. :-73.80 Max. :40.86 Max. :-73.80 Max. :40.86
## NA's :20 NA's :81 NA's :110 NA's :119
## q.passenger_count q.trip_distance q.fare_amount q.extra
## Min. :1.000 Min. : 0.010 Min. : 1.00 Min. :0.0000
## 1st Qu.:1.000 1st Qu.: 1.020 1st Qu.: 6.00 1st Qu.:0.0000
## Median :1.000 Median : 1.760 Median : 9.00 Median :0.5000
## Mean :1.371 Mean : 2.719 Mean :11.47 Mean :0.3523
## 3rd Qu.:1.000 3rd Qu.: 3.420 3rd Qu.:14.50 3rd Qu.:0.5000
## Max. :6.000 Max. :27.000 Max. :60.00 Max. :1.0000
## NA's :2 NA's :62 NA's :30
## q.mta_tax q.tip_amount q.tolls_amount q.improvement_surcharge
## Min. :0.0000 Min. : 0.000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.5000 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:0.3000
## Median :0.5000 Median : 0.000 Median :0.00000 Median :0.3000
## Mean :0.4871 Mean : 1.029 Mean :0.04671 Mean :0.2923
## 3rd Qu.:0.5000 3rd Qu.: 1.700 3rd Qu.:0.00000 3rd Qu.:0.3000
## Max. :0.5000 Max. :30.000 Max. :5.54000 Max. :0.3000
## NA's :2 NA's :7
## q.target.total_amount q.tlenkm q.travel_time q.espeed
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 3.239
## 1st Qu.: 7.80 1st Qu.: 1.609 1st Qu.: 5.767 1st Qu.:14.826
## Median : 10.80 Median : 2.800 Median : 9.550 Median :18.613
## Mean : 13.93 Mean : 4.358 Mean : 19.863 Mean :20.490
## 3rd Qu.: 17.00 3rd Qu.: 5.472 3rd Qu.: 16.125 3rd Qu.:23.647
## Max. :128.76 Max. :69.314 Max. :1438.183 Max. :75.657
## NA's :105
res.imputation<-imputePCA(df[,vars_quantitatives],ncp=5)
summary(res.imputation$completeObs)
## q.pickup_longitude q.pickup_latitude q.dropoff_longitude q.dropoff_latitude
## Min. :-74.05 Min. :40.58 Min. :-74.06 Min. :40.58
## 1st Qu.:-73.96 1st Qu.:40.70 1st Qu.:-73.97 1st Qu.:40.70
## Median :-73.94 Median :40.75 Median :-73.94 Median :40.75
## Mean :-73.93 Mean :40.75 Mean :-73.94 Mean :40.74
## 3rd Qu.:-73.92 3rd Qu.:40.80 3rd Qu.:-73.91 3rd Qu.:40.79
## Max. :-73.80 Max. :40.86 Max. :-73.80 Max. :40.86
## q.passenger_count q.trip_distance q.fare_amount q.extra
## Min. :1.000 Min. : 0.010 Min. : 1.00 Min. :0.0000
## 1st Qu.:1.000 1st Qu.: 1.020 1st Qu.: 6.00 1st Qu.:0.0000
## Median :1.000 Median : 1.770 Median : 9.00 Median :0.5000
## Mean :1.371 Mean : 2.737 Mean :11.65 Mean :0.3523
## 3rd Qu.:1.000 3rd Qu.: 3.430 3rd Qu.:14.50 3rd Qu.:0.5000
## Max. :6.000 Max. :32.462 Max. :99.58 Max. :1.0000
## q.mta_tax q.tip_amount q.tolls_amount q.improvement_surcharge
## Min. :0.0000 Min. : 0.000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.5000 1st Qu.: 0.000 1st Qu.:0.00000 1st Qu.:0.3000
## Median :0.5000 Median : 0.000 Median :0.00000 Median :0.3000
## Mean :0.4871 Mean : 1.029 Mean :0.04761 Mean :0.2923
## 3rd Qu.:0.5000 3rd Qu.: 1.700 3rd Qu.:0.00000 3rd Qu.:0.3000
## Max. :0.5000 Max. :30.000 Max. :5.54000 Max. :0.3000
## q.target.total_amount q.tlenkm q.travel_time q.espeed
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. :-46.26
## 1st Qu.: 7.80 1st Qu.: 1.609 1st Qu.: 5.767 1st Qu.: 14.81
## Median : 10.80 Median : 2.800 Median : 9.550 Median : 18.60
## Mean : 13.93 Mean : 4.358 Mean : 19.863 Mean : 20.20
## 3rd Qu.: 17.00 3rd Qu.: 5.472 3rd Qu.: 16.125 3rd Qu.: 23.64
## Max. :128.76 Max. :69.314 Max. :1438.183 Max. : 77.48
We proceed now to fix all the numeric variables that have errors or outliers:
summary(res.imputation$completeObs[,"q.pickup_longitude"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -74.05 -73.96 -73.94 -73.93 -73.92 -73.80
summary(res.imputation$completeObs[,"q.pickup_latitude"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.58 40.70 40.75 40.75 40.80 40.86
summary(res.imputation$completeObs[,"q.dropoff_longitude"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -74.06 -73.97 -73.94 -73.94 -73.91 -73.80
summary(res.imputation$completeObs[,"q.dropoff_latitude"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.58 40.70 40.75 40.74 40.79 40.86
We decided to create categorical for this variable so we categorize it for single passengers, couple and groups (3 or more)
df$f.passenger_groups[res.imputation$completeObs[,"q.passenger_count"] == 1] = "passenger_single"
df$f.passenger_groups[res.imputation$completeObs[,"q.passenger_count"] > 1 & res.imputation$completeObs[,"q.passenger_count"] <= 2] = "passenger_couple"
df$f.passenger_groups[res.imputation$completeObs[,"q.passenger_count"] >= 3] = "passenger_group"
df$f.passenger_groups <- factor(df$f.passenger_groups)
We see the barplot in order to see the distribution of passenger per trip
barplot(table(df$f.passenger_groups),main="passenger_groups barplot",col="darkslateblue")
ll<-which(res.imputation$completeObs[,"q.trip_distance"] < 0)
res.imputation$completeObs[ll,"q.trip_distance"] <- 1
ll<-which(res.imputation$completeObs[,"q.trip_distance"] > 30)
res.imputation$completeObs[ll,"q.trip_distance"] <- 30
ll<-which(res.imputation$completeObs[,"q.fare_amount"] > 60)
res.imputation$completeObs[ll,"q.fare_amount"] <- 60
If we execute a table, we’ll see that we have 0, 0’5 and 1 values, so we proceed to categorize this variable to see if has extra or not.
table(df$q.extra)
##
## 0 0.5 1
## 2128 1733 762
df$f.extra[df$q.extra == 0] = 0
df$f.extra[df$q.extra > 0] = 1
df$f.extra<-factor(df$f.extra, labels=c("extra_no","extra_yes"))
We see the barplot in order to see the distribution.
barplot(table(df$f.extra),main="extra barplot",col="darkslateblue")
#### q.mta_tax If we execute a summary, we’ll see that every value should be 0.5 or 0, so we proceed to categorize this variable in order to see if the tax has been paid or not.
table(df$q.mta_tax)
##
## 0 0.5
## 119 4504
df$f.mta_tax<-factor(df$q.mta_tax, labels =c("mta_no","mta_yes"))
We see the barplot in order to see the distribution.
barplot(table(df$q.mta_tax),main="mta_tax barplot",col="darkslateblue")
#### q.tip_amount
ll<-which(res.imputation$completeObs[,"q.tip_amount"] > 17)
res.imputation$completeObs[ll,"q.tip_amount"] <- 17
We see that we have correct data, so we proceed to create the binary factor TipIsGiven.
df$f.target.tip_is_given[(res.imputation$completeObs[,"q.tip_amount"] > 0)] = "tip_yes"
df$f.target.tip_is_given[(res.imputation$completeObs[,"q.tip_amount"] == 0)] = "tip_no"
df$f.target.tip_is_given <- factor(df$f.target.tip_is_given)
summary(df$f.target.tip_is_given)
## tip_no tip_yes
## 2882 1741
As we checked before the imputation and detected as errors those individuals with negative amount, the negative values found now are going to be set as 0 because they result negative during the imputation. After treating this values, we proceed to categorize this variable to see if an individual has paid or not for a toll.
ll<-which(res.imputation$completeObs[,"q.tolls_amount"] < 0)
res.imputation$completeObs[ll,"q.tolls_amount"] <- 0
df$f.paid_tolls[res.imputation$completeObs[,"q.tolls_amount"] == 0] = "tolls_no"
df$f.paid_tolls[res.imputation$completeObs[,"q.tolls_amount"] > 0] = "tolls_yes"
df$f.paid_tolls <- factor(df$f.paid_tolls)
If we execute a table, we’ll see that every value should be 0.3 or 0, so we proceed to categorize this variable in order to see if the surcharge has been paid or not.
table(df$q.improvement_surcharge)
##
## 0 0.3
## 118 4505
df$f.improvement_surcharge<-factor(df$q.improvement_surcharge, labels=c("improvement_no","improvement_yes"))
We see the barplot in order to see the distribution.
barplot(table(df$f.improvement_surcharge),main="improvement_surcharge barplot",col="darkslateblue")
ll<-which(res.imputation$completeObs[,"q.tlenkm"] <= 1)
res.imputation$completeObs[ll,"q.tlenkm"] <- 1
ll<-which(res.imputation$completeObs[,"q.tlenkm"] > 48.28)
res.imputation$completeObs[ll,"q.tlenkm"] <- 48.28
ll<-which(res.imputation$completeObs[,"q.tlenkm"] > 60)
res.imputation$completeObs[ll,"q.tlenkm"] <- 60
ll<-which(res.imputation$completeObs[,"q.espeed"] < 3)
res.imputation$completeObs[ll,"q.espeed"] <- 3
ll<-which(res.imputation$completeObs[,"q.espeed"] > 55)
res.imputation$completeObs[ll,"q.espeed"] <- 55
We proceed to impute all NAs in our numerical variables that are stored in: res.imputation$completeObs
df[,vars_quantitatives] <- res.imputation$completeObs
vars_categorical<-names(df)[c(1,2,3,17,18,20,26,27,28,29,30,31,32)]
summary(df[,vars_categorical])
## f.vendor_id f.store_and_fwd_flag f.rate_code_id
## vendor_id_mobile : 973 flag-no :4605 rate_code_id_1 :4496
## vendor_id_verifone:3650 flag-yes: 18 rate_code_id_other: 127
##
##
## f.payment_type f.trip_type f.period
## credit card:2096 trip_street_hail:4511 period night :1642
## cash :2497 trip_dispatch : 112 period morning : 542
## no paid : 30 period valley :1260
## period afternoon:1179
## f.trip_distance_range f.passenger_groups f.extra
## trip_dist_long : 645 passenger_couple: 345 extra_no :2128
## trip_dist_medium: 986 passenger_group : 395 extra_yes:2495
## trip_dist_short :2930 passenger_single:3883
## NA's : 62
## f.mta_tax f.target.tip_is_given f.paid_tolls
## mta_no : 119 tip_no :2882 tolls_no :4576
## mta_yes:4504 tip_yes:1741 tolls_yes: 47
##
##
## f.improvement_surcharge
## improvement_no : 118
## improvement_yes:4505
##
##
res.input<-imputeMCA(df[,vars_categorical],ncp=10)
summary(res.input$completeObs)
## f.vendor_id f.store_and_fwd_flag f.rate_code_id
## vendor_id_mobile : 973 flag-no :4605 rate_code_id_1 :4496
## vendor_id_verifone:3650 flag-yes: 18 rate_code_id_other: 127
##
##
## f.payment_type f.trip_type f.period
## credit card:2096 trip_street_hail:4511 period night :1642
## cash :2497 trip_dispatch : 112 period morning : 542
## no paid : 30 period valley :1260
## period afternoon:1179
## f.trip_distance_range f.passenger_groups f.extra
## trip_dist_long : 650 passenger_couple: 345 extra_no :2128
## trip_dist_medium: 988 passenger_group : 395 extra_yes:2495
## trip_dist_short :2985 passenger_single:3883
##
## f.mta_tax f.target.tip_is_given f.paid_tolls
## mta_no : 119 tip_no :2882 tolls_no :4576
## mta_yes:4504 tip_yes:1741 tolls_yes: 47
##
##
## f.improvement_surcharge
## improvement_no : 118
## improvement_yes:4505
##
##
We proceed to impute all NAs in our numerical variables that are stored in: res.input$completeObs
df[,vars_categorical] <- res.input$completeObs
df$f.dist[df$q.trip_distance<=1.6] = "(0, 1.6]"
df$f.dist[(df$q.trip_distance>1.6) & (df$q.trip_distance<=3)] = "(1.6, 3]"
df$f.dist[(df$q.trip_distance>3) & (df$q.trip_distance<=5.5)] = "(3, 5.5]"
df$f.dist[(df$q.trip_distance>5.5) & (df$q.trip_distance<=30)] = "(5.5, 30]"
df$f.dist<-factor(df$f.dist)
df$f.hour[(df$q.hour>=17) & (df$q.hour<18)] = "17"
df$f.hour[(df$q.hour>=18) & (df$q.hour<19)] = "18"
df$f.hour[(df$q.hour>=19) & (df$q.hour<20)] = "19"
df$f.hour[(df$q.hour>=20) & (df$q.hour<21)] = "20"
df$f.hour[(df$q.hour>=21) & (df$q.hour<22)] = "21"
df$f.hour[(df$q.hour>=22) & (df$q.hour<23)] = "22"
df$f.hour[(df$q.hour<17)] = "other"
df$f.hour[(df$q.hour>=23)] = "other"
df$f.hour<-factor(df$f.hour)
df$f.espeed[(df$q.espeed>=3) & (df$q.espeed<20)] = "[03,20)"
df$f.espeed[(df$q.espeed>=20) & (df$q.espeed<40)] = "[20,40)"
df$f.espeed[(df$q.espeed>=40) & (df$q.espeed<=55)] = "[40,55]"
df$f.espeed<-factor(df$f.espeed)
We are skipping longitudes and latitudes.
library(mvoutlier)
library(FactoMineR)
vars_quantitatives_no_coords <- names(df)[c(8,9,10,11,12,13,14,15,16,21,22,23)]
res <- cor(df[,vars_quantitatives_no_coords])
round(res, 2)
## q.passenger_count q.trip_distance q.fare_amount q.extra
## q.passenger_count 1.00 0.02 0.01 0.05
## q.trip_distance 0.02 1.00 0.92 -0.05
## q.fare_amount 0.01 0.92 1.00 -0.06
## q.extra 0.05 -0.05 -0.06 1.00
## q.mta_tax 0.00 -0.08 -0.10 0.15
## q.tip_amount -0.01 0.42 0.42 0.01
## q.tolls_amount 0.02 0.20 0.20 -0.03
## q.improvement_surcharge 0.01 -0.07 -0.08 0.15
## q.target.total_amount 0.02 0.91 0.95 -0.01
## q.tlenkm 0.02 0.99 0.91 -0.05
## q.travel_time 0.00 0.11 0.12 0.03
## q.espeed 0.02 0.57 0.41 -0.05
## q.mta_tax q.tip_amount q.tolls_amount
## q.passenger_count 0.00 -0.01 0.02
## q.trip_distance -0.08 0.42 0.20
## q.fare_amount -0.10 0.42 0.20
## q.extra 0.15 0.01 -0.03
## q.mta_tax 1.00 0.04 0.01
## q.tip_amount 0.04 1.00 0.18
## q.tolls_amount 0.01 0.18 1.00
## q.improvement_surcharge 0.96 0.05 0.02
## q.target.total_amount -0.05 0.57 0.25
## q.tlenkm -0.04 0.41 0.21
## q.travel_time 0.01 0.02 0.00
## q.espeed -0.08 0.21 0.16
## q.improvement_surcharge q.target.total_amount q.tlenkm
## q.passenger_count 0.01 0.02 0.02
## q.trip_distance -0.07 0.91 0.99
## q.fare_amount -0.08 0.95 0.91
## q.extra 0.15 -0.01 -0.05
## q.mta_tax 0.96 -0.05 -0.04
## q.tip_amount 0.05 0.57 0.41
## q.tolls_amount 0.02 0.25 0.21
## q.improvement_surcharge 1.00 -0.03 -0.03
## q.target.total_amount -0.03 1.00 0.88
## q.tlenkm -0.03 0.88 1.00
## q.travel_time 0.01 0.11 0.11
## q.espeed -0.07 0.40 0.56
## q.travel_time q.espeed
## q.passenger_count 0.00 0.02
## q.trip_distance 0.11 0.57
## q.fare_amount 0.12 0.41
## q.extra 0.03 -0.05
## q.mta_tax 0.01 -0.08
## q.tip_amount 0.02 0.21
## q.tolls_amount 0.00 0.16
## q.improvement_surcharge 0.01 -0.07
## q.target.total_amount 0.11 0.40
## q.tlenkm 0.11 0.56
## q.travel_time 1.00 -0.14
## q.espeed -0.14 1.00
library(corrplot)
## corrplot 0.84 loaded
corrplot(res,method="square",type="upper",tl.col="black",tl.cex=0.75,)
As we can see in this graph, we have the correlation between all quantitative variables. We must say, however, that there are two variables (espeed and traveltime) which we had to modify when making the imputation.
Now, let’s describe each correlation we obtained in the graph (we will only mention one relation once): * Diagonals: Being exactly the same variable, it is directly related to itself. * q.passanger_count: not too related to any other not seen before * q.trip_distance + w/ q.fare_amount: More distance, more time, therefore more price. + w/ q.tip_amount: If the trip has been longer, there may be more reason to tip. + w/ q.target.total_amount: As before, more distance, more time, therefore more price. + w/ q.tlenkm: They are exactly the same, only with a metric change. + w/ q.travel_time: The further away, the longer. + w/ q.espeed: The reason we think these variables are related to a direct and positive proportion is that since short trips have to be, logically cheaper, what taxi drivers do is slow down so that the trip take longer and thus charge more. Therefore, by increasing the distance of the journey, taxi drivers do not need to go so slow and therefore the speed increases. * q.fare_amount: + w/ q.tip_amount: In the USA it is normal to give a tip proportional to the price of the service that has been offered. + w/ q.target.total_amount: The variable q.target.total_amount is equivalent to q.fare_amount plus the fees, tips, among others, that have been applied to the trip. + w/ q.tlenkm: As before, more distance, more time, therefore more price + w/ q.travel_time: More time, more price. + w/ q.espeed: As we said before, more speed means more distance, therefore more travel time, causing more price. * q.extra: not too related to any other not seen before * q.mta_tax: + w/ q.improvement_subcharge: if there’s a tax, the most probable thing to happen is that there’s an improvement subcharge too * q.tip_amount: + w/ q.target.total_amount: As before, in the USA it is normal to give a tip proportional to the price of the service that has been offered. + w/ q.tlenkm: If the trip has been longer, there may be more reason to tip. + w/ q.travel_time: The longer it takes, the more price, and therefore the more tip given the proportionality. + w/ q.espeed: The more speed, as we said before, the more distance, and therefore the longer it takes. This causes more price and therefore more tip. * q.tolls_amount: not too related to any other not seen before * q.improvement_subcharge: not too related to any other not seen before * q.target.total_amount: + w/ q.tlenkm: More distance, more time, therefore more price. + w/ q.espeed: As we said before, more speed means more distance, therefore more travel time, causing more price. * q.tlenkm + Same as for q.trip_distance + q.espeed correlation. * q.travel_time: not too related to any other not seen before * q.espeed: not too related to any other not seen before
library(mvoutlier)
library(chemometrics)
multivariant_outliers <- Moutlier(df[, c(9,10,16,23)], quantile = 0.995)
multivariant_outliers$cutoff
## [1] 3.854901
par(mfrow=c(1,1))
plot(multivariant_outliers$md, multivariant_outliers$rd, type="n")
text(multivariant_outliers$md, multivariant_outliers$rd, labels=rownames(df[, c(9,10,16,23)]), cex=0.5)
As we can see, above the defined line we have all the possible observations that we call multivariate outliers. These mean that, viewed only from the point of view of a variable, it does not have to be an outlier, but that viewed with various dimensions (variables), it may be so.
Anem a feer una mirada ràpida dels outliers multivariants més pronunciats per entendre com són:
df[which(row.names(df)=="1208612"), 1:35]
## f.vendor_id f.store_and_fwd_flag f.rate_code_id q.pickup_longitude
## 1208612 vendor_id_mobile flag-no rate_code_id_1 -73.9154
## q.pickup_latitude q.dropoff_longitude q.dropoff_latitude
## 1208612 40.65837 -73.91541 40.65837
## q.passenger_count q.trip_distance q.fare_amount q.extra q.mta_tax
## 1208612 1 7.828788 2.5 1 0.5
## q.tip_amount q.tolls_amount q.improvement_surcharge
## 1208612 3.164389 0 0.3
## q.target.total_amount f.payment_type f.trip_type q.hour
## 1208612 100.3 credit card trip_street_hail 18
## f.period q.tlenkm q.travel_time q.espeed qual.pickup
## 1208612 period afternoon 1 0.06666667 28.72946 18
## qual.dropoff f.trip_distance_range f.passenger_groups f.extra
## 1208612 18 trip_dist_short passenger_single extra_yes
## f.mta_tax f.target.tip_is_given f.paid_tolls f.improvement_surcharge
## 1208612 mta_yes tip_yes tolls_no improvement_yes
## f.dist f.hour f.espeed
## 1208612 (5.5, 30] 18 [20,40)
We can see that, for only 1km, the whole trip cost $100, and that’s not normal.
We are removing them from our dataset:
df <- subset(df, !(multivariant_outliers$md>10 | multivariant_outliers$rd>60))
multivariant_outliers <- Moutlier(df[, c(9,10,16,23)], quantile = 0.995)
par(mfrow=c(1,1))
plot(multivariant_outliers$md, multivariant_outliers$rd, type="n")
text(multivariant_outliers$md, multivariant_outliers$rd, labels=rownames(df[, c(9,10,16,23)]), cex=0.5)
We can see now that there are not multivariant outliers in our dataframe.
Profiling is used to finish profiling our sample.
We will now proceed to the profiling that asks us for our numeric target (Total_amount) and then we have to use the original variables and factors.
In order to observe the relationship of our numerical target with the other variables we use the condes tool that provides us with information about the relationships between the indicated variables and the target.
library(FactoMineR)
summary(df$q.target.total_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.80 10.56 13.64 16.80 77.16
vars_res<-names(df)[c(16,30)]
vars_quantitatives<-names(df)[c(8:15,21:23)]
vars_categorical<-names(df)[c(1:3,17,18,20,26:29,31,32)]
res.condes <- condes(df[, c(vars_res,vars_quantitatives, vars_categorical)],1)
Let’s now look at the correlations between our Total_amount target and the variables in the following groups. We will basically look at p.value, which we know that the smaller the correlation between the variables.
res.condes$quanti
## correlation p.value
## q.fare_amount 0.97485988 0.000000e+00
## q.trip_distance 0.93406225 0.000000e+00
## q.tlenkm 0.92187648 0.000000e+00
## q.tip_amount 0.56127714 0.000000e+00
## q.espeed 0.39613486 1.421071e-172
## q.tolls_amount 0.23802440 3.144377e-60
## q.travel_time 0.11290029 1.625434e-14
## q.mta_tax -0.03030698 3.990201e-02
For the lowest p.values:
res.condes$quali
## R2 p.value
## f.trip_distance_range 0.6827079230 0.000000e+00
## f.paid_tolls 0.0754334605 2.364785e-80
## f.target.tip_is_given 0.0657915150 5.688596e-70
## f.payment_type 0.0593330010 9.600215e-62
## f.rate_code_id 0.0038737420 2.412314e-05
## f.mta_tax 0.0009185128 3.990201e-02
For the lowest p.values:
res.condes$category
## Estimate p.value
## f.trip_distance_range=trip_dist_long 11.3373225 0.000000e+00
## f.paid_tolls=tolls_yes 12.8813000 2.364785e-80
## f.target.tip_is_given=tip_yes 2.3631665 5.688596e-70
## f.payment_type=credit card 2.2944595 8.798382e-63
## f.rate_code_id=rate_code_id_other 1.7783900 2.412314e-05
## f.period=period morning 0.6908169 2.778052e-02
## f.mta_tax=mta_no 0.8772005 3.990201e-02
## f.mta_tax=mta_yes -0.8772005 3.990201e-02
## f.rate_code_id=rate_code_id_1 -1.7783900 2.412314e-05
## f.trip_distance_range=trip_dist_medium -1.5397811 2.318709e-46
## f.payment_type=cash -2.0845145 1.489206e-62
## f.target.tip_is_given=tip_no -2.3631665 5.688596e-70
## f.paid_tolls=tolls_no -12.8813000 2.364785e-80
## f.trip_distance_range=trip_dist_short -9.7975414 0.000000e+00
For the lowest p.values:
And now, we are profiling the qualitative target:
res.catdes <- catdes(df[, c(vars_res,vars_quantitatives, vars_categorical)],2)
Let’s now look at the correlations between our f.target.tip_is_given target and the variables in the following groups. We will basically look at p.value, which we know that the smaller the correlation between the variables.
res.catdes$test.chi2
## p.value df
## f.payment_type 0.000000e+00 2
## f.trip_distance_range 2.785249e-23 2
## f.mta_tax 5.054079e-06 1
## f.improvement_surcharge 6.545475e-06 1
## f.trip_type 1.208825e-05 1
## f.rate_code_id 1.463909e-05 1
## f.period 5.138587e-05 3
## f.paid_tolls 3.327123e-04 1
For the lowest p.values:
res.catdes$quanti.var
## Eta2 P-value
## q.tip_amount 0.545641143 0.000000e+00
## q.target.total_amount 0.065791515 5.688596e-70
## q.fare_amount 0.015030865 7.324378e-17
## q.trip_distance 0.013289537 4.499661e-15
## q.tlenkm 0.013272638 4.683133e-15
## q.espeed 0.007569834 3.449643e-09
## q.mta_tax 0.004528319 4.960153e-06
## q.improvement_surcharge 0.004420639 6.430281e-06
## q.tolls_amount 0.003369017 8.228375e-05
For the lowest p.values:
res.catdes$category
## $tip_no
## Cla/Mod Mod/Cla Global
## f.payment_type=cash 100.00000 86.6852562 54.1005003
## f.trip_distance_range=trip_dist_short 67.73109 70.2335308 64.7161192
## f.payment_type=no paid 100.00000 1.0108052 0.6308462
## f.mta_tax=mta_no 83.03571 3.2415476 2.4363715
## f.improvement_surcharge=improvement_no 82.88288 3.2066922 2.4146182
## f.trip_type=trip_dispatch 82.85714 3.0324155 2.2840983
## f.rate_code_id=rate_code_id_other 81.73913 3.2764029 2.5016315
## f.period=period valley 67.30463 29.4179157 27.2786600
## f.paid_tolls=tolls_no 62.65642 99.4771697 99.0863607
## f.period=period morning 56.48148 10.6308818 11.7467914
## f.paid_tolls=tolls_yes 35.71429 0.5228303 0.9136393
## f.rate_code_id=rate_code_id_1 61.91432 96.7235971 97.4983685
## f.trip_type=trip_street_hail 61.93232 96.9675845 97.7159017
## f.improvement_surcharge=improvement_yes 61.90370 96.7933078 97.5853818
## f.mta_tax=mta_yes 61.89521 96.7584524 97.5636285
## f.trip_distance_range=trip_dist_medium 54.05680 18.5779017 21.4487709
## f.trip_distance_range=trip_dist_long 50.47170 11.1885674 13.8351099
## f.payment_type=credit card 16.96300 12.3039387 45.2686535
## p.value v.test
## f.payment_type=cash 0.000000e+00 Inf
## f.trip_distance_range=trip_dist_short 1.081649e-23 10.033894
## f.payment_type=no paid 1.094456e-06 4.873847
## f.mta_tax=mta_no 1.634763e-06 4.794015
## f.improvement_surcharge=improvement_no 2.209458e-06 4.733257
## f.trip_type=trip_dispatch 4.384881e-06 4.592252
## f.rate_code_id=rate_code_id_other 5.893910e-06 4.530160
## f.period=period valley 2.460929e-05 4.218353
## f.paid_tolls=tolls_no 5.031113e-04 3.479094
## f.period=period morning 2.667731e-03 -3.003637
## f.paid_tolls=tolls_yes 5.031113e-04 -3.479094
## f.rate_code_id=rate_code_id_1 5.893910e-06 -4.530160
## f.trip_type=trip_street_hail 4.384881e-06 -4.592252
## f.improvement_surcharge=improvement_yes 2.209458e-06 -4.733257
## f.mta_tax=mta_yes 1.634763e-06 -4.794015
## f.trip_distance_range=trip_dist_medium 1.393474e-09 -6.056232
## f.trip_distance_range=trip_dist_long 3.929634e-11 -6.606707
## f.payment_type=credit card 0.000000e+00 -Inf
##
## $tip_yes
## Cla/Mod Mod/Cla Global
## f.payment_type=credit card 83.03700 100.000000 45.2686535
## f.trip_distance_range=trip_dist_long 49.52830 18.229167 13.8351099
## f.trip_distance_range=trip_dist_medium 45.94320 26.215278 21.4487709
## f.mta_tax=mta_yes 38.10479 98.900463 97.5636285
## f.improvement_surcharge=improvement_yes 38.09630 98.900463 97.5853818
## f.trip_type=trip_street_hail 38.06768 98.958333 97.7159017
## f.rate_code_id=rate_code_id_1 38.08568 98.784722 97.4983685
## f.paid_tolls=tolls_yes 64.28571 1.562500 0.9136393
## f.period=period morning 43.51852 13.599537 11.7467914
## f.paid_tolls=tolls_no 37.34358 98.437500 99.0863607
## f.period=period valley 32.69537 23.726852 27.2786600
## f.rate_code_id=rate_code_id_other 18.26087 1.215278 2.5016315
## f.trip_type=trip_dispatch 17.14286 1.041667 2.2840983
## f.improvement_surcharge=improvement_no 17.11712 1.099537 2.4146182
## f.mta_tax=mta_no 16.96429 1.099537 2.4363715
## f.payment_type=no paid 0.00000 0.000000 0.6308462
## f.trip_distance_range=trip_dist_short 32.26891 55.555556 64.7161192
## f.payment_type=cash 0.00000 0.000000 54.1005003
## p.value v.test
## f.payment_type=credit card 0.000000e+00 Inf
## f.trip_distance_range=trip_dist_long 3.929634e-11 6.606707
## f.trip_distance_range=trip_dist_medium 1.393474e-09 6.056232
## f.mta_tax=mta_yes 1.634763e-06 4.794015
## f.improvement_surcharge=improvement_yes 2.209458e-06 4.733257
## f.trip_type=trip_street_hail 4.384881e-06 4.592252
## f.rate_code_id=rate_code_id_1 5.893910e-06 4.530160
## f.paid_tolls=tolls_yes 5.031113e-04 3.479094
## f.period=period morning 2.667731e-03 3.003637
## f.paid_tolls=tolls_no 5.031113e-04 -3.479094
## f.period=period valley 2.460929e-05 -4.218353
## f.rate_code_id=rate_code_id_other 5.893910e-06 -4.530160
## f.trip_type=trip_dispatch 4.384881e-06 -4.592252
## f.improvement_surcharge=improvement_no 2.209458e-06 -4.733257
## f.mta_tax=mta_no 1.634763e-06 -4.794015
## f.payment_type=no paid 1.094456e-06 -4.873847
## f.trip_distance_range=trip_dist_short 1.081649e-23 -10.033894
## f.payment_type=cash 0.000000e+00 -Inf
df$f.store_and_fwd_flag <- NULL
names(df)
## [1] "f.vendor_id" "f.rate_code_id"
## [3] "q.pickup_longitude" "q.pickup_latitude"
## [5] "q.dropoff_longitude" "q.dropoff_latitude"
## [7] "q.passenger_count" "q.trip_distance"
## [9] "q.fare_amount" "q.extra"
## [11] "q.mta_tax" "q.tip_amount"
## [13] "q.tolls_amount" "q.improvement_surcharge"
## [15] "q.target.total_amount" "f.payment_type"
## [17] "f.trip_type" "q.hour"
## [19] "f.period" "q.tlenkm"
## [21] "q.travel_time" "q.espeed"
## [23] "qual.pickup" "qual.dropoff"
## [25] "f.trip_distance_range" "f.passenger_groups"
## [27] "f.extra" "f.mta_tax"
## [29] "f.target.tip_is_given" "f.paid_tolls"
## [31] "f.improvement_surcharge" "f.dist"
## [33] "f.hour" "f.espeed"
vars_res<-names(df)[c(15,29)]
vars_quantitatives<-names(df)[c(3,4,5,6,7,8,9,10,12,20,21,22)]
vars_categorical<-names(df)[c(1,2,16,17,19,25,30)]
We have already seen profiling in the previous installment. So now, let’s proceed to look at the main components.
library(FactoMineR)
res.pca <- PCA(df[,c(1,2,3,4,5,6,7,8,9,10,12,13,15,16,17,19,21,22,25,26,29)],quanti.sup=c(3:6,13),quali.sup=c(1,2,14:16,19:21))
As we know, those variables that have an angle of 90 degrees, are not related. Taking a first look at the PCA obtained, we see that, for example, q.extra and q.travel_time are not at all related. On the other hand, also looking at q.travel_time, we see that it is very positively related to q.extra. If there were a variable that went in the opposite direction, we would say that it is inversely related.
We deleted the multivariant outliers.
Eigenvalues correspond to the amount of the variation explained by each principal component (PC). Eigenvalues are large for the first PC and small for the subsequent PCs.
A PC with an eigenvalue > 1 indicates that the PC accounts for more variance than accounted by one of the original variables in standardized data. This is commonly used as a cutoff point to determine the number of PCs to retain, using the Kaiser criteria.
eigenvalues <- res.pca$eig
head(eigenvalues[, 1:3])
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.6119054 32.648818 32.64882
## comp 2 1.1131628 13.914535 46.56335
## comp 3 1.0407522 13.009402 59.57276
## comp 4 0.9480904 11.851130 71.42389
## comp 5 0.9313478 11.641847 83.06573
## comp 6 0.7881854 9.852317 92.91805
In this case, then, we will use up to dimension 3, and they will explain 59.57% of the total inertia.
As a brief definition, we would say that the elbow rule is based on selecting dimensions until the difference in variance of that of the next factorial plane is almost the same as that of the current plane.
So let’s look at exactly where we have this minimal difference:
fviz_screeplot(res.pca,addlabels=TRUE,ylim=c(0,50),barfill="darkslateblue",barcolor="darkslateblue",linecolor="skyblue1")
We could say, then, that there is little difference between dimension 3 and 4, or between 5 and 6. Therefore, we could be left with 3 dimensions (as with Kasier) or 5.
fviz_pca_ind(res.pca, col.ind="contrib", geom = "point") +
scale_color_gradient2(low="darkslateblue", mid="white",
high="red", midpoint=0.40)
We can see that there are some individuals that are too contributive. So now, let’s try to understand them better with extreme individuals.
rang<-order(res.pca$ind$coord[,1])
contrib.extremes<-c(row.names(df)[rang[1]], row.names(df)[rang[length(rang)]])
contrib.extremes<-c(row.names(df)[rang[1:10]], row.names(df)[rang[(length(rang)-10):length(rang)]])
fviz_pca_ind(res.pca, select.ind = list(names=contrib.extremes))
We can now have a look at them:
df[which(row.names(df) %in% row.names(df)[rang[length(rang)]]), 1:34]
## f.vendor_id f.rate_code_id q.pickup_longitude q.pickup_latitude
## 710390 vendor_id_verifone rate_code_id_1 -73.93688 40.81975
## q.dropoff_longitude q.dropoff_latitude q.passenger_count q.trip_distance
## 710390 -73.84977 40.67285 1 19
## q.fare_amount q.extra q.mta_tax q.tip_amount q.tolls_amount
## 710390 50.5 0.5 0.5 11.47 5.54
## q.improvement_surcharge q.target.total_amount f.payment_type
## 710390 0.3 68.81 credit card
## f.trip_type q.hour f.period q.tlenkm q.travel_time q.espeed
## 710390 trip_street_hail 23 period night 30.57754 30.53333 55
## qual.pickup qual.dropoff f.trip_distance_range f.passenger_groups
## 710390 23 00 trip_dist_long passenger_single
## f.extra f.mta_tax f.target.tip_is_given f.paid_tolls
## 710390 extra_yes mta_yes tip_yes tolls_yes
## f.improvement_surcharge f.dist f.hour f.espeed
## 710390 improvement_yes (5.5, 30] other [40,55]
df[which(row.names(df) %in% row.names(df)[rang[1]]),1:34]
## f.vendor_id f.rate_code_id q.pickup_longitude q.pickup_latitude
## 25458 vendor_id_verifone rate_code_id_1 -73.896 40.85568
## q.dropoff_longitude q.dropoff_latitude q.passenger_count q.trip_distance
## 25458 -73.89645 40.85497 1 0.05
## q.fare_amount q.extra q.mta_tax q.tip_amount q.tolls_amount
## 25458 3 0.5 0.5 0 0
## q.improvement_surcharge q.target.total_amount f.payment_type
## 25458 0.3 4.3 cash
## f.trip_type q.hour f.period q.tlenkm q.travel_time q.espeed
## 25458 trip_street_hail 4 period night 1 1.35 3.57632
## qual.pickup qual.dropoff f.trip_distance_range f.passenger_groups
## 25458 04 04 trip_dist_short passenger_single
## f.extra f.mta_tax f.target.tip_is_given f.paid_tolls
## 25458 extra_yes mta_yes tip_no tolls_no
## f.improvement_surcharge f.dist f.hour f.espeed
## 25458 improvement_yes (0, 1.6] other [03,20)
!! FALTA DESCRIPCIÓ
rang<-order(res.pca$ind$coord[,2])
contrib.extremes<-c(row.names(df)[rang[1]], row.names(df)[rang[length(rang)]])
contrib.extremes<-c(row.names(df)[rang[1:10]], row.names(df)[rang[(length(rang)-10):length(rang)]])
fviz_pca_ind(res.pca, select.ind = list(names=contrib.extremes))
We can now have a look at them:
df[which(row.names(df) %in% row.names(df)[rang[length(rang)]]), 1:34]
## f.vendor_id f.rate_code_id q.pickup_longitude q.pickup_latitude
## 513170 vendor_id_verifone rate_code_id_1 -73.91908 40.75881
## q.dropoff_longitude q.dropoff_latitude q.passenger_count q.trip_distance
## 513170 -73.90479 40.77545 5 1.47
## q.fare_amount q.extra q.mta_tax q.tip_amount q.tolls_amount
## 513170 8 1 0.5 0 0
## q.improvement_surcharge q.target.total_amount f.payment_type
## 513170 0.3 9.8 cash
## f.trip_type q.hour f.period q.tlenkm q.travel_time q.espeed
## 513170 trip_street_hail 18 period afternoon 2.365736 1435.633 3
## qual.pickup qual.dropoff f.trip_distance_range f.passenger_groups
## 513170 18 18 trip_dist_short passenger_group
## f.extra f.mta_tax f.target.tip_is_given f.paid_tolls
## 513170 extra_yes mta_yes tip_no tolls_no
## f.improvement_surcharge f.dist f.hour f.espeed
## 513170 improvement_yes (0, 1.6] 18 [03,20)
df[which(row.names(df) %in% row.names(df)[rang[1]]),1:34]
## f.vendor_id f.rate_code_id q.pickup_longitude q.pickup_latitude
## 37238 vendor_id_verifone rate_code_id_1 -73.94037 40.79722
## q.dropoff_longitude q.dropoff_latitude q.passenger_count q.trip_distance
## 37238 -73.87116 40.77416 1 6.29
## q.fare_amount q.extra q.mta_tax q.tip_amount q.tolls_amount
## 37238 19 0 0.5 5.07 5.54
## q.improvement_surcharge q.target.total_amount f.payment_type
## 37238 0.3 30.41 credit card
## f.trip_type q.hour f.period q.tlenkm q.travel_time q.espeed
## 37238 trip_street_hail 9 period morning 10.12277 11.3 53.74924
## qual.pickup qual.dropoff f.trip_distance_range f.passenger_groups
## 37238 09 09 trip_dist_long passenger_single
## f.extra f.mta_tax f.target.tip_is_given f.paid_tolls
## 37238 extra_no mta_yes tip_yes tolls_yes
## f.improvement_surcharge f.dist f.hour f.espeed
## 37238 improvement_yes (5.5, 30] other [40,55]
!! FALTA DESCRIPCIÓ
Since we’ve commented before that we don’t consider multivariate outliers, no action should be taken here.
res.des <- dimdesc(res.pca)
fviz_contrib(res.pca,fill="darkslateblue",color="darkslateblue",choice="var",axes=1,top=5)
res.des$Dim.1
## $quanti
## correlation p.value
## q.trip_distance 0.95322388 0.000000e+00
## q.target.total_amount 0.93605843 0.000000e+00
## q.fare_amount 0.91080652 0.000000e+00
## q.espeed 0.65229084 0.000000e+00
## q.tip_amount 0.56831521 0.000000e+00
## q.tolls_amount 0.33080214 8.179942e-118
## q.travel_time 0.08215764 2.428778e-08
## q.dropoff_longitude -0.04932166 8.222526e-04
## q.pickup_latitude -0.08911658 1.425162e-09
## q.extra -0.09078698 6.980586e-10
## q.dropoff_latitude -0.11194676 2.698967e-14
##
## $quali
## R2 p.value
## f.trip_distance_range 0.734487633 0.000000e+00
## f.target.tip_is_given 0.074640721 1.703060e-79
## f.payment_type 0.067324252 2.960615e-70
## f.rate_code_id 0.008800369 1.856377e-10
## f.period 0.009696196 1.042195e-09
## f.trip_type 0.003237363 1.134716e-04
##
## $category
## Estimate p.value
## f.trip_distance_range=trip_dist_long 2.19005241 0.000000e+00
## f.target.tip_is_given=tip_yes 0.45579967 1.703060e-79
## f.payment_type=credit card 0.43231989 3.206294e-71
## f.rate_code_id=rate_code_id_other 0.48538778 1.856377e-10
## f.period=period night 0.12392590 1.547427e-05
## f.trip_type=trip_dispatch 0.30775450 1.134716e-04
## f.period=period morning 0.18859438 1.837635e-03
## f.passenger_groups=passenger_group 0.08918874 4.311506e-02
## f.passenger_groups=passenger_single -0.09080622 3.224384e-02
## f.trip_type=trip_street_hail -0.30775450 1.134716e-04
## f.period=period afternoon -0.23755523 4.454839e-08
## f.rate_code_id=rate_code_id_1 -0.48538778 1.856377e-10
## f.trip_distance_range=trip_dist_medium -0.37566812 3.645498e-37
## f.payment_type=cash -0.41235661 4.308884e-71
## f.target.tip_is_given=tip_no -0.45579967 1.703060e-79
## f.trip_distance_range=trip_dist_short -1.81438428 0.000000e+00
##
## attr(,"class")
## [1] "condes" "list "
In the first dimension we see that for the quantitative variables the most positively related, from more to less, are:
If we take look at the qualitatives ones, we that the most related is
Finally, if we take a look at the categories we see that for the f.trip_distance_range=trip_dist_long category long distance trips show a mean 2.19 units over the global mean and f.trip_distance_range=trip_dist_short show a mean -1.81 units under the global mean, so we can reject the H0 done in the t.Student test.
fviz_contrib(res.pca,fill="darkslateblue",color="darkslateblue",choice="var",axes=2,top=5)
res.des$Dim.2
## $quanti
## correlation p.value
## q.travel_time 0.84015337 0.000000e+00
## q.extra 0.44066626 9.995666e-218
## q.target.total_amount 0.15604039 1.898171e-26
## q.passenger_count 0.15381486 9.760185e-26
## q.fare_amount 0.13635896 1.599137e-20
## q.tip_amount 0.10527571 8.320588e-13
## q.trip_distance 0.06726571 5.003894e-06
## q.pickup_longitude -0.02948244 4.562621e-02
## q.pickup_latitude -0.07450801 4.250392e-07
## q.dropoff_latitude -0.08092222 3.926148e-08
## q.tolls_amount -0.09915740 1.608358e-11
## q.espeed -0.38134730 4.831181e-159
##
## $quali
## R2 p.value
## f.period 0.056218287 2.524117e-57
## f.passenger_groups 0.019697707 1.425485e-20
## f.rate_code_id 0.007407674 5.072925e-09
## f.trip_type 0.006690333 2.799099e-08
## f.target.tip_is_given 0.002571513 5.828298e-04
## f.payment_type 0.003030426 9.382853e-04
## f.vendor_id 0.001994394 2.456928e-03
## f.trip_distance_range 0.002488277 3.270886e-03
##
## $category
## Estimate p.value
## f.period=period afternoon 0.41767959 1.191658e-45
## f.passenger_groups=passenger_group 0.29359495 5.139154e-20
## f.rate_code_id=rate_code_id_1 0.29072337 5.072925e-09
## f.trip_type=trip_street_hail 0.28882396 2.799099e-08
## f.payment_type=credit card 0.13643754 3.062510e-04
## f.target.tip_is_given=tip_yes 0.05523076 5.828298e-04
## f.payment_type=cash 0.02579756 6.429713e-04
## f.vendor_id=vendor_id_verifone 0.05780466 2.456928e-03
## f.trip_distance_range=trip_dist_long 0.06785618 6.954837e-03
## f.vendor_id=vendor_id_mobile -0.05780466 2.456928e-03
## f.trip_distance_range=trip_dist_short -0.07344640 1.484587e-03
## f.target.tip_is_given=tip_no -0.05523076 5.828298e-04
## f.trip_type=trip_dispatch -0.28882396 2.799099e-08
## f.rate_code_id=rate_code_id_other -0.29072337 5.072925e-09
## f.period=period morning -0.30341857 2.779501e-16
## f.passenger_groups=passenger_single -0.22722700 6.952138e-17
## f.period=period valley -0.17206022 1.211949e-17
##
## attr(,"class")
## [1] "condes" "list "
For the second dimension we see that or the quantitative variables q.travel_time and q.extra are the most positively related ones with 0.84 and 0.44 respectively.
If we see the qualitative variables we notice that period is the most related with 0.06 even though it is not a very remarkable data.
And we see that for this category, f.period=period afternoon mean is 0.42 units over the global mean and f.period=period valley mean, on the contrary, is -0.17 units under the global mean, so we can reject the H0 done in the t.Student test.
fviz_contrib(res.pca,fill="darkslateblue",color="darkslateblue",choice="var",axes=3,top=5)
res.des$Dim.3
## $quanti
## correlation p.value
## q.passenger_count 0.75767528 0.000000e+00
## q.extra 0.54954392 0.000000e+00
## q.tolls_amount 0.15465732 5.266327e-26
## q.espeed 0.14925195 2.597462e-24
## q.tip_amount 0.03961839 7.220541e-03
## q.dropoff_longitude 0.03929744 7.705295e-03
## q.pickup_longitude 0.02912336 4.832721e-02
## q.target.total_amount -0.03059885 3.802736e-02
## q.trip_distance -0.04336717 3.272292e-03
## q.dropoff_latitude -0.05848745 7.252815e-05
## q.pickup_latitude -0.06027622 4.324795e-05
## q.fare_amount -0.08164812 2.963315e-08
## q.travel_time -0.32919610 1.264003e-116
##
## $quali
## R2 p.value
## f.passenger_groups 0.521044760 0.000000e+00
## f.period 0.129510618 9.050501e-138
## f.trip_type 0.006375189 5.934821e-08
## f.rate_code_id 0.005819348 2.238819e-07
## f.target.tip_is_given 0.003259450 1.075109e-04
## f.vendor_id 0.002241084 1.324394e-03
## f.payment_type 0.001868579 1.362101e-02
## f.trip_distance_range 0.001612683 2.454314e-02
##
## $category
## Estimate p.value
## f.passenger_groups=passenger_group 1.48059745 0.000000e+00
## f.period=period afternoon 0.48310494 1.302958e-51
## f.period=period night 0.30757870 4.850300e-25
## f.trip_type=trip_street_hail 0.27261527 5.934821e-08
## f.rate_code_id=rate_code_id_1 0.24915550 2.238819e-07
## f.target.tip_is_given=tip_yes 0.06012474 1.075109e-04
## f.vendor_id=vendor_id_verifone 0.05924895 1.324394e-03
## f.trip_distance_range=trip_dist_short 0.05546595 6.885854e-03
## f.payment_type=credit card 0.14134887 1.087463e-02
## f.payment_type=cash 0.06755819 2.234684e-02
## f.trip_distance_range=trip_dist_medium -0.03644684 3.159981e-02
## f.vendor_id=vendor_id_mobile -0.05924895 1.324394e-03
## f.target.tip_is_given=tip_no -0.06012474 1.075109e-04
## f.rate_code_id=rate_code_id_other -0.24915550 2.238819e-07
## f.trip_type=trip_dispatch -0.27261527 5.934821e-08
## f.passenger_groups=passenger_couple -0.36269220 1.494004e-19
## f.period=period valley -0.26011474 1.737679e-49
## f.period=period morning -0.53056890 6.433404e-54
## f.passenger_groups=passenger_single -1.11790526 0.000000e+00
##
## attr(,"class")
## [1] "condes" "list "
For the last dimension we took into account, the third one, we see that the most related quantitative variables are:
For the inversely related one, we also see that traveltime time (-0.32).
For the quanlitatives, we see that f.passenger_groups is the category that is more related with 0.52.
And we see that for this category, f.passenger_groups=passenger_group is 1.48 units over the global mean, and f.passenger_groups=passenger_single, on the contrary, is -1.12 units under the global mean.
We can conclude, then, that the first dimension is the one with the biggest correlations to the target
We want to take analyze the supplementary factor kind of rate, so we want to add lines that join the categories of this factor for the first factorial plane. With the following plot we can see it.
plot(res.pca$ind$coord[,1],res.pca$ind$coord[,2],pch=19,col="grey30")
points(res.pca$quali.sup$coord[,1],res.pca$quali.sup$coord[,2],pch=15,col="cadetblue1")
lines(res.pca$quali.sup$coord[3:4,1],res.pca$quali.sup$coord[3:4,2],lwd=2,lty=2,col="coral")
text(res.pca$quali.sup$coord[,1],res.pca$quali.sup$coord[,2],labels=names(res.pca$quali.sup$coord[,1]),col="cadetblue1",cex=0.5)
!!!!!! On queda AnyTip?
res.hcpc <- HCPC(res.pca,nb.clust=5, order=TRUE)
Note: If we chose the default number of cluster it would be 3, as we can guess from the inertia reduction plot, that follows the Elbow’s rule (number of black lines plus 1). In our case, due to the amount of data we have, the reason why we chose 5 as the number of clusters is because, after trying different numbers, we thought it was the best way to distribute the data.
Number of observations in each cluster:
table(res.hcpc$data.clust$clust)
##
## 1 2 3 4 5
## 3407 331 25 799 35
barplot(table(res.hcpc$data.clust$clust), col="darkslateblue", border="darkslateblue", main="[hierarchical] #observations/cluster")
res.hcpc$desc.var$test.chi2
## p.value df
## f.trip_distance_range 0.000000e+00 8
## f.passenger_groups 0.000000e+00 8
## f.target.tip_is_given 3.843439e-42 4
## f.payment_type 1.138510e-31 8
## f.period 2.824610e-09 12
## f.vendor_id 3.699900e-07 4
## f.rate_code_id 1.897208e-05 4
## f.trip_type 1.586841e-03 4
We start wit the description of the categorical variables that characterize the clusters, so in this output we do not have dimensions because it is the total association. We can see the intensity of the variables, in our case the variables that affect more to the clustering are f.trip_distance_range and f.passenger_groups because are the one with the smallest p.value. The variables associated to the clusters are the ones that appear on the output.
Next, we want to see for each cluster which are the categories that characterize them.The clusters that contain more individuals are the first, the second and the fourth one. Cluster number 4 has less individuals. We proceed to analyze them.
res.hcpc$desc.var$category
## $`1`
## Cla/Mod Mod/Cla Global
## f.trip_distance_range=trip_dist_short 91.4621849 79.8649839 64.716119
## f.passenger_groups=passenger_single 80.9844560 91.7522747 83.967805
## f.target.tip_is_given=tip_no 80.0278843 67.3906663 62.410268
## f.payment_type=cash 80.3377563 58.6439683 54.100500
## f.period=period afternoon 77.3890785 26.6216613 25.494888
## f.rate_code_id=rate_code_id_1 74.4310576 97.9160552 97.498369
## f.rate_code_id=rate_code_id_other 61.7391304 2.0839448 2.501631
## f.trip_distance_range=trip_dist_medium 68.9655172 19.9589081 21.448771
## f.period=period night 70.2023299 33.6072791 35.479661
## f.payment_type=credit card 66.6025949 40.6809510 45.268653
## f.target.tip_is_given=tip_yes 64.2939815 32.6093337 37.589732
## f.passenger_groups=passenger_group 4.0712468 0.4696214 8.549054
## f.trip_distance_range=trip_dist_long 0.9433962 0.1761080 13.835110
## p.value v.test
## f.trip_distance_range=trip_dist_short 5.224477e-287 36.204222
## f.passenger_groups=passenger_single 1.619909e-115 22.845501
## f.target.tip_is_given=tip_no 1.797897e-31 11.670769
## f.payment_type=cash 1.441748e-25 10.451534
## f.period=period afternoon 2.793760e-03 2.989564
## f.rate_code_id=rate_code_id_1 3.229373e-03 2.945017
## f.rate_code_id=rate_code_id_other 3.229373e-03 -2.945017
## f.trip_distance_range=trip_dist_medium 4.021999e-05 -4.106212
## f.period=period night 8.203878e-06 -4.459793
## f.payment_type=credit card 4.662918e-26 -10.558041
## f.target.tip_is_given=tip_yes 1.797897e-31 -11.670769
## f.passenger_groups=passenger_group 1.144810e-217 -31.491244
## f.trip_distance_range=trip_dist_long 0.000000e+00 -Inf
##
## $`2`
## Cla/Mod Mod/Cla Global
## f.passenger_groups=passenger_group 84.223919 100.0000000 8.549054
## f.vendor_id=vendor_id_verifone 8.209366 90.0302115 78.964542
## f.period=period night 9.012876 44.4108761 35.479661
## f.trip_distance_range=trip_dist_short 7.798319 70.0906344 64.716119
## f.rate_code_id=rate_code_id_1 7.318162 99.0936556 97.498369
## f.rate_code_id=rate_code_id_other 2.608696 0.9063444 2.501631
## f.period=period valley 5.582137 21.1480363 27.278660
## f.trip_distance_range=trip_dist_long 4.716981 9.0634441 13.835110
## f.period=period morning 3.148148 5.1359517 11.746791
## f.vendor_id=vendor_id_mobile 3.412616 9.9697885 21.035458
## f.passenger_groups=passenger_couple 0.000000 0.0000000 7.483141
## f.passenger_groups=passenger_single 0.000000 0.0000000 83.967805
## p.value v.test
## f.passenger_groups=passenger_group 0.000000e+00 Inf
## f.vendor_id=vendor_id_verifone 3.520712e-08 5.513348
## f.period=period night 5.133572e-04 3.473688
## f.trip_distance_range=trip_dist_short 3.234128e-02 2.140167
## f.rate_code_id=rate_code_id_1 3.762300e-02 2.078939
## f.rate_code_id=rate_code_id_other 3.762300e-02 -2.078939
## f.period=period valley 8.080026e-03 -2.648707
## f.trip_distance_range=trip_dist_long 6.553553e-03 -2.718718
## f.period=period morning 2.331841e-05 -4.230489
## f.vendor_id=vendor_id_mobile 3.520712e-08 -5.513348
## f.passenger_groups=passenger_couple 2.397295e-12 -7.009174
## f.passenger_groups=passenger_single 4.454824e-297 -36.838586
##
## $`3`
## Cla/Mod Mod/Cla Global p.value
## f.vendor_id=vendor_id_verifone 0.6887052 100 78.96454 0.002680541
## f.trip_distance_range=trip_dist_long 1.2578616 32 13.83511 0.020452911
## f.target.tip_is_given=tip_no 0.7319624 84 62.41027 0.021898539
## f.target.tip_is_given=tip_yes 0.2314815 16 37.58973 0.021898539
## f.vendor_id=vendor_id_mobile 0.0000000 0 21.03546 0.002680541
## v.test
## f.vendor_id=vendor_id_verifone 3.002179
## f.trip_distance_range=trip_dist_long 2.317934
## f.target.tip_is_given=tip_no 2.292123
## f.target.tip_is_given=tip_yes -2.292123
## f.vendor_id=vendor_id_mobile -3.002179
##
## $`4`
## Cla/Mod Mod/Cla Global
## f.trip_distance_range=trip_dist_long 88.522013 70.463079 13.835110
## f.target.tip_is_given=tip_yes 26.909722 58.197747 37.589732
## f.payment_type=credit card 24.699664 64.330413 45.268653
## f.trip_distance_range=trip_dist_medium 23.022312 28.410513 21.448771
## f.rate_code_id=rate_code_id_other 34.782609 5.006258 2.501631
## f.trip_type=trip_dispatch 31.428571 4.130163 2.284098
## f.period=period night 19.497241 39.799750 35.479661
## f.passenger_groups=passenger_couple 21.802326 9.386733 7.483141
## f.vendor_id=vendor_id_mobile 19.544984 23.654568 21.035458
## f.vendor_id=vendor_id_verifone 16.804408 76.345432 78.964542
## f.trip_type=trip_street_hail 17.052538 95.869837 97.715902
## f.passenger_groups=passenger_group 9.923664 4.881101 8.549054
## f.period=period afternoon 13.310580 19.524406 25.494888
## f.rate_code_id=rate_code_id_1 16.934404 94.993742 97.498369
## f.payment_type=cash 11.258544 35.043805 54.100500
## f.target.tip_is_given=tip_no 11.641687 41.802253 62.410268
## f.trip_distance_range=trip_dist_short 0.302521 1.126408 64.716119
## p.value v.test
## f.trip_distance_range=trip_dist_long 0.000000e+00 Inf
## f.target.tip_is_given=tip_yes 9.404862e-39 13.020099
## f.payment_type=credit card 1.071312e-32 11.908306
## f.trip_distance_range=trip_dist_medium 2.821586e-07 5.134990
## f.rate_code_id=rate_code_id_other 5.625382e-06 4.540002
## f.trip_type=trip_dispatch 3.860221e-04 3.549460
## f.period=period night 5.274367e-03 2.789781
## f.passenger_groups=passenger_couple 2.814339e-02 2.195282
## f.vendor_id=vendor_id_mobile 4.775977e-02 1.979500
## f.vendor_id=vendor_id_verifone 4.775977e-02 -1.979500
## f.trip_type=trip_street_hail 3.860221e-04 -3.549460
## f.passenger_groups=passenger_group 1.583928e-05 -4.316681
## f.period=period afternoon 1.362414e-05 -4.349833
## f.rate_code_id=rate_code_id_1 5.625382e-06 -4.540002
## f.payment_type=cash 1.087737e-32 -11.907037
## f.target.tip_is_given=tip_no 9.404862e-39 -13.020099
## f.trip_distance_range=trip_dist_short 0.000000e+00 -Inf
##
## $`5`
## Cla/Mod Mod/Cla Global p.value
## f.trip_distance_range=trip_dist_long 4.5597484 82.85714 13.83511 5.135380e-20
## f.payment_type=credit card 1.3455070 80.00000 45.26865 3.148521e-05
## f.target.tip_is_given=tip_yes 1.3888889 68.57143 37.58973 2.353494e-04
## f.period=period morning 2.0370370 31.42857 11.74679 1.966052e-03
## f.target.tip_is_given=tip_no 0.3834089 31.42857 62.41027 2.353494e-04
## f.payment_type=cash 0.2814636 20.00000 54.10050 4.329649e-05
## f.trip_distance_range=trip_dist_short 0.0000000 0.00000 64.71612 1.150987e-16
## v.test
## f.trip_distance_range=trip_dist_long 9.161136
## f.payment_type=credit card 4.162449
## f.target.tip_is_given=tip_yes 3.677697
## f.period=period morning 3.095313
## f.target.tip_is_given=tip_no -3.677697
## f.payment_type=cash -4.089146
## f.trip_distance_range=trip_dist_short -8.288072
!!! FALTA EXPLICACIÓ CLUSTERS
We now proceed to see the quantitative variables that characterizes the clusters.
res.hcpc$desc.var$quanti.var
## Eta2 P-value
## q.passenger_count 0.778208444 0.000000e+00
## q.trip_distance 0.597538632 0.000000e+00
## q.fare_amount 0.552214922 0.000000e+00
## q.tolls_amount 0.986622171 0.000000e+00
## q.target.total_amount 0.572312116 0.000000e+00
## q.travel_time 0.986512400 0.000000e+00
## q.espeed 0.319322275 0.000000e+00
## q.tip_amount 0.203833432 2.378012e-225
## q.extra 0.012557438 7.479488e-12
## q.dropoff_latitude 0.006478004 5.250277e-06
## q.dropoff_longitude 0.005207690 8.056275e-05
## q.pickup_latitude 0.003908569 1.241208e-03
We can see in the output that all the variables that appear are slightly over represented in the clusters. We can notice that the greatest represented is the q.passenger_count with 0.78 units over the global mean, we can also remark the q.trip_distance with 0.60 units over the mean and the q.fare_amount variable with 0.55 units over the mean.
The least over represented are the q.pickup_latitude with 0.004 units over the mean, the q.dropoff_longitude with 0.005 units over the mean, the dropoff_latitude with 0.006 units over the mean and the q.extra with 0.013 units over the total mean.
We want to know now which variables are associated with the quantitative variables.
res.hcpc$desc.var$quanti
## $`1`
## v.test Mean in category Overall mean sd in category
## q.dropoff_latitude 4.510624 40.7471697 40.74501576 0.05531320
## q.dropoff_longitude 3.571509 -73.9335631 -73.93493301 0.04192871
## q.pickup_latitude 3.115068 40.7482900 40.74679725 0.05477564
## q.extra 2.135227 0.3595539 0.35273004 0.37340661
## q.tolls_amount -10.379735 0.0000000 0.04385662 0.00000000
## q.travel_time -11.417279 9.5232414 19.77243130 8.63506889
## q.tip_amount -25.157719 0.6165013 0.99849250 1.01932503
## q.espeed -30.225967 18.0254818 20.29155358 5.74580135
## q.passenger_count -31.520479 1.0867391 1.37144230 0.29748755
## q.fare_amount -42.340824 8.5700553 11.45424939 3.77250022
## q.target.total_amount -42.868569 10.3007837 13.63611051 4.11001073
## q.trip_distance -44.069029 1.6856248 2.67531897 1.01078264
## Overall sd p.value
## q.dropoff_latitude 0.05477752 6.463721e-06
## q.dropoff_longitude 0.04399989 3.549302e-04
## q.pickup_latitude 0.05497004 1.839022e-03
## q.extra 0.36659461 3.274250e-02
## q.tolls_amount 0.48467599 3.066260e-25
## q.travel_time 102.97446305 3.428003e-30
## q.tip_amount 1.74174533 1.163432e-139
## q.espeed 8.59995255 1.079921e-200
## q.passenger_count 1.03610097 4.553213e-218
## q.fare_amount 7.81389859 0.000000e+00
## q.target.total_amount 8.92487225 0.000000e+00
## q.trip_distance 2.57614388 0.000000e+00
##
## $`2`
## v.test Mean in category Overall mean sd in category
## q.passenger_count 59.636625 4.6435045 1.3714423 1.0571774
## q.extra 4.629278 0.4425982 0.3527300 0.3515583
## q.tip_amount -2.127487 0.8022659 0.9984925 1.2261415
## q.target.total_amount -3.522865 11.9711480 13.6361105 6.1065438
## q.trip_distance -3.632492 2.1797761 2.6753190 1.7638149
## q.fare_amount -3.675137 9.9335347 11.4542494 5.6676689
## Overall sd p.value
## q.passenger_count 1.0361010 0.000000e+00
## q.extra 0.3665946 3.669425e-06
## q.tip_amount 1.7417453 3.337965e-02
## q.target.total_amount 8.9248722 4.269085e-04
## q.trip_distance 2.5761439 2.806972e-04
## q.fare_amount 7.8138986 2.377218e-04
##
## $`3`
## v.test Mean in category Overall mean sd in category
## q.travel_time 67.239191 1400.9360 19.772431 103.3050666
## q.fare_amount 2.980540 16.1000 11.454249 10.3720779
## q.trip_distance 2.882144 4.1564 2.675319 3.5615782
## q.extra 2.560878 0.5400 0.352730 0.3720215
## q.target.total_amount 2.458614 18.0132 13.636111 11.3032309
## q.espeed -10.079636 3.0000 20.291554 0.0000000
## Overall sd p.value
## q.travel_time 102.9744630 0.000000e+00
## q.fare_amount 7.8138986 2.877402e-03
## q.trip_distance 2.5761439 3.949798e-03
## q.extra 0.3665946 1.044079e-02
## q.target.total_amount 8.9248722 1.394746e-02
## q.espeed 8.5999526 6.797725e-24
##
## $`4`
## v.test Mean in category Overall mean sd in category
## q.trip_distance 49.867781 6.806785469 2.67531897 2.92823075
## q.fare_amount 48.263672 23.582613012 11.45424939 8.68392028
## q.target.total_amount 47.895728 27.383266583 13.63611051 9.87743567
## q.espeed 35.309560 30.057232437 20.29155358 10.84819052
## q.tip_amount 28.520676 2.596057572 0.99849250 2.85745519
## q.tolls_amount -2.194663 0.009648166 0.04385662 0.13446467
## q.pickup_longitude -2.581893 -73.938435447 -73.93501246 0.04354721
## q.pickup_latitude -3.482727 40.740640384 40.74679725 0.05696841
## q.passenger_count -4.574268 1.219023780 1.37144230 0.63941349
## q.dropoff_longitude -4.674408 -73.941547436 -73.93493301 0.05130426
## q.dropoff_latitude -4.789105 40.736579091 40.74501576 0.05256389
## q.extra -5.714628 0.285356696 0.35273004 0.32891271
## Overall sd p.value
## q.trip_distance 2.57614388 0.000000e+00
## q.fare_amount 7.81389859 0.000000e+00
## q.target.total_amount 8.92487225 0.000000e+00
## q.espeed 8.59995255 4.189076e-273
## q.tip_amount 1.74174533 6.492136e-179
## q.tolls_amount 0.48467599 2.818777e-02
## q.pickup_longitude 0.04122424 9.826015e-03
## q.pickup_latitude 0.05497004 4.963334e-04
## q.passenger_count 1.03610097 4.778883e-06
## q.dropoff_longitude 0.04399989 2.948034e-06
## q.dropoff_latitude 0.05477752 1.675264e-06
## q.extra 0.36659461 1.099444e-08
##
## $`5`
## v.test Mean in category Overall mean sd in category
## q.tolls_amount 67.336872 5.54000 0.04385662 0.000000
## q.target.total_amount 15.609970 37.09771 13.63611051 12.607038
## q.trip_distance 13.029590 8.32800 2.67531897 4.138739
## q.fare_amount 11.357817 26.39995 11.45424939 10.171810
## q.espeed 9.987695 34.75644 20.29155358 11.521959
## q.tip_amount 9.796571 3.87200 0.99849250 3.613309
## Overall sd p.value
## q.tolls_amount 0.484676 0.000000e+00
## q.target.total_amount 8.924872 6.226587e-55
## q.trip_distance 2.576144 8.305252e-39
## q.fare_amount 7.813899 6.781847e-30
## q.espeed 8.599953 1.725466e-23
## q.tip_amount 1.741745 1.164730e-22
!!! FALTA EXPLICACIÓ CLUSTERS
res.hcpc$desc.ind$para
## Cluster: 1
## 59388 430223 1168622 325136 995462
## 0.3007173 0.3012123 0.3017848 0.3018312 0.3031328
## ------------------------------------------------------------
## Cluster: 2
## 952205 607516 21675 1331493 20669
## 0.4277255 0.4383450 0.4833966 0.5293226 0.5374432
## ------------------------------------------------------------
## Cluster: 3
## 1330280 821975 381123 423307 659831
## 0.4759781 0.5399269 0.9871099 1.1472860 1.1501812
## ------------------------------------------------------------
## Cluster: 4
## 705868 286717 155337 1399118 10891
## 0.3945283 0.4215951 0.4251876 0.4497589 0.4662729
## ------------------------------------------------------------
## Cluster: 5
## 529475 1010826 1016299 101184 1051194
## 0.9615489 1.0742546 1.0896649 1.2422734 1.2586186
What we obtain are the more representative individuals, paragons, for each cluster. We get the rownames of each paragon in every single cluster.
res.hcpc$desc.ind$dist
## Cluster: 1
## 1281722 1254963 675551 641604 985739
## 4.948078 4.250487 4.196297 4.183627 4.182230
## ------------------------------------------------------------
## Cluster: 2
## 169380 1027878 550938 644602 512718
## 5.323835 5.222956 5.216222 5.146947 5.085894
## ------------------------------------------------------------
## Cluster: 3
## 88821 1040346 1060542 1159509 49078
## 14.74079 13.77654 13.76978 13.71268 13.64502
## ------------------------------------------------------------
## Cluster: 4
## 1242754 486866 621544 487457 1354552
## 12.162912 9.292928 9.290145 8.938108 8.863770
## ------------------------------------------------------------
## Cluster: 5
## 710390 194151 1342604 731288 360250
## 14.09537 13.37083 13.07660 12.94008 12.75737
What we obtain are those individuals of each cluster that that far away in the same cluster from the rest of the individuals. We also obtain the rownames of each individual with the bigger distance respect the other ones in the cluster.
We get the grpahical representation for the individuals that characterize classes (para and dist).
para1<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$para[[1]]))
dist1<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$dist[[1]]))
para2<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$para[[2]]))
dist2<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$dist[[2]]))
para3<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$para[[3]]))
dist3<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$dist[[3]]))
para4<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$para[[4]]))
dist4<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$dist[[4]]))
para5<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$para[[5]]))
dist5<-which(rownames(res.pca$ind$coord)%in%names(res.hcpc$desc.ind$dist[[5]]))
plot(res.pca$ind$coord[,1],res.pca$ind$coord[,2],col="grey50",cex=0.5,pch=16)
points(res.pca$ind$coord[para1,1],res.pca$ind$coord[para1,2],col="blue",cex=1,pch=16)
points(res.pca$ind$coord[dist1,1],res.pca$ind$coord[dist1,2],col="chartreuse3",cex=1,pch=16)
points(res.pca$ind$coord[para2,1],res.pca$ind$coord[para2,2],col="blue",cex=1,pch=16)
points(res.pca$ind$coord[dist2,1],res.pca$ind$coord[dist2,2],col="darkorchid3",cex=1,pch=16)
points(res.pca$ind$coord[para3,1],res.pca$ind$coord[para3,2],col="blue",cex=1,pch=16)
points(res.pca$ind$coord[dist3,1],res.pca$ind$coord[dist3,2],col="firebrick3",cex=1,pch=16)
points(res.pca$ind$coord[para4,1],res.pca$ind$coord[para4,2],col="blue",cex=1,pch=16)
points(res.pca$ind$coord[dist4,1],res.pca$ind$coord[dist4,2],col="palevioletred3",cex=1,pch=16)
points(res.pca$ind$coord[para5,1],res.pca$ind$coord[para5,2],col="blue",cex=1,pch=16)
points(res.pca$ind$coord[dist5,1],res.pca$ind$coord[dist5,2],col="royalblue1",cex=1,pch=16)
!! FALTA EXPLICACIÓ
We are going to evaluate the partition quality.
((res.hcpc$call$t$within[1]-res.hcpc$call$t$within[5])/res.hcpc$call$t$within[1])*100
## [1] 62.31156
The quality of this reduction if of 62.31%.
In case we wanted to achieve an 80% of the clustering representativity we would need 9 clusters.
((res.hcpc$call$t$within[1]-res.hcpc$call$t$within[9])/res.hcpc$call$t$within[1])*100
## [1] 80.46785
res.hcpc$call$t$inert.gain[1:5]
## [1] 1.5003375 0.9780995 0.9260693 0.7362581 0.4335335
df$hcpck<-res.hcpc$data.clust$clust
res.pca <- PCA(df[,c(1,2,3,4,5,6,7,8,9,10,12,13,15,16,17,19,21,22,25,26)],quanti.sup=c(3:6,13),quali.sup=c(1,2,14:16,19:20),ncp=5,graph=FALSE)
ppcc<-res.pca$ind$coord[,1:3] ## 3 components principals (kaiser)
dim(ppcc)
## [1] 4597 3
library("factoextra")
# fviz_nbclust(ppcc, kmeans, method = "gap_stat") ## !!!!Descomentar pel deliverable, triga molt.
According to the previous plot, the optimal number of clusters per k-means is 3, so we guess maybe something is wrong or missing.
dist<-dist(ppcc)
kc<-kmeans(dist, 5, iter.max=30, trace=TRUE) #caclulate the distances, it turns into a matrix
## KMNS(*, k=5): iter= 1, indx=0
## QTRAN(): istep=4597, icoun=9
## QTRAN(): istep=9194, icoun=21
## QTRAN(): istep=13791, icoun=3
## QTRAN(): istep=18388, icoun=17
## QTRAN(): istep=22985, icoun=173
## QTRAN(): istep=27582, icoun=49
## QTRAN(): istep=32179, icoun=35
## QTRAN(): istep=36776, icoun=686
## QTRAN(): istep=41373, icoun=875
## QTRAN(): istep=45970, icoun=1724
## QTRAN(): istep=50567, icoun=397
## QTRAN(): istep=55164, icoun=1556
## QTRAN(): istep=59761, icoun=987
## QTRAN(): istep=64358, icoun=143
## QTRAN(): istep=68955, icoun=1224
## QTRAN(): istep=73552, icoun=277
## QTRAN(): istep=78149, icoun=2
## QTRAN(): istep=82746, icoun=131
## QTRAN(): istep=87343, icoun=375
## QTRAN(): istep=91940, icoun=302
## KMNS(*, k=5): iter= 2, indx=16
## QTRAN(): istep=4597, icoun=41
## QTRAN(): istep=9194, icoun=0
## QTRAN(): istep=13791, icoun=27
## QTRAN(): istep=18388, icoun=349
## QTRAN(): istep=22985, icoun=83
## QTRAN(): istep=27582, icoun=1899
## KMNS(*, k=5): iter= 3, indx=212
## QTRAN(): istep=4597, icoun=37
## QTRAN(): istep=9194, icoun=418
## QTRAN(): istep=13791, icoun=284
## QTRAN(): istep=18388, icoun=2091
## QTRAN(): istep=22985, icoun=3028
## KMNS(*, k=5): iter= 4, indx=4597
We see from the output that in 4 iterations it has converged. We now procceed to save in the data frame the number of clusters.
df$claKM<-0
df$claKM<-kc$cluster
df$claKM<-factor(df$claKM)
barplot(table(df$claKM),col="darkslateblue",border="darkslateblue",main="[k-means]#observations/cluster")
The american school does the partition quality evaluation in 5 clusters is done very fast, and after executing the following chunk we get an explicability of the 80.37%
100*(kc$betweenss/kc$totss)
## [1] 80.36938
If we want to know the characteristics of each cluster, as we did with the hierarchical, we need to execute a catdes to obtain these characteristics. In the following output we get them.
dim(df)
## [1] 4597 36
res.cat <-catdes(df,30)
res.cat
##
## Link between the cluster variable and the categorical variables (chi-square test)
## =================================================================================
## p.value df
## hcpck 0.000000e+00 4
## claKM 2.969599e-108 4
## f.dist 1.227202e-39 3
## f.trip_distance_range 1.120390e-36 2
## f.espeed 1.023536e-16 2
## f.payment_type 6.207353e-05 2
## f.target.tip_is_given 3.327123e-04 1
## f.period 1.670625e-02 3
## qual.dropoff 3.446248e-02 23
##
## Description of each cluster by the categories
## =============================================
## $tolls_no
## Cla/Mod Mod/Cla Global
## hcpck=1 100.00000 74.7969265 74.1135523
## f.trip_distance_range=trip_dist_short 100.00000 65.3128430 64.7161192
## f.dist=(0, 1.6] 100.00000 46.4983535 46.0735262
## f.espeed=[03,20) 99.85152 59.0559824 58.6034370
## claKM=4 100.00000 43.0954995 42.7017620
## claKM=2 100.00000 31.7233809 31.4335436
## f.dist=(1.6, 3] 100.00000 25.1152580 24.8857951
## f.payment_type=cash 99.63812 54.4017563 54.1005003
## f.target.tip_is_given=tip_no 99.47717 62.6564215 62.4102676
## claKM=3 99.88038 18.3315038 18.1857733
## hcpck=2 100.00000 7.2667398 7.2003481
## f.hour=other 98.87908 63.9077936 64.0417664
## qual.pickup=09 97.28261 3.9297475 4.0026104
## f.period=period morning 97.96296 11.6136114 11.7467914
## qual.dropoff=09 96.75676 3.9297475 4.0243637
## f.target.tip_is_given=tip_yes 98.43750 37.3435785 37.5897324
## f.espeed=[20,40) 98.41828 36.8825467 37.1329128
## f.payment_type=credit card 98.46228 44.9835346 45.2686535
## f.espeed=[40,55] 94.38776 4.0614709 4.2636502
## claKM=5 80.64516 0.5488474 0.6743528
## f.trip_distance_range=trip_dist_long 94.65409 13.2162459 13.8351099
## f.dist=(5.5, 30] 94.10714 11.5697036 12.1818577
## claKM=1 89.13043 6.3007684 7.0045682
## hcpck=5 0.00000 0.0000000 0.7613661
## p.value v.test
## hcpck=1 1.296248e-25 10.461617
## f.trip_distance_range=trip_dist_short 7.035056e-20 9.127113
## f.dist=(0, 1.6] 4.630049e-12 6.916480
## f.espeed=[03,20) 3.507713e-11 6.623509
## claKM=4 6.036808e-11 6.542823
## claKM=2 1.199840e-07 5.293520
## f.dist=(1.6, 3] 5.664839e-06 4.538528
## f.payment_type=cash 1.747824e-05 4.294887
## f.target.tip_is_given=tip_no 5.031113e-04 3.479094
## claKM=3 2.393235e-03 3.036523
## hcpck=2 4.271649e-02 2.026472
## f.hour=other 4.522426e-02 -2.002563
## qual.pickup=09 3.088826e-02 -2.158509
## f.period=period morning 9.930884e-03 -2.578227
## qual.dropoff=09 7.382495e-03 -2.679079
## f.target.tip_is_given=tip_yes 5.031113e-04 -3.479094
## f.espeed=[20,40) 3.975441e-04 -3.541709
## f.payment_type=credit card 5.206205e-05 -4.046172
## f.espeed=[40,55] 9.507371e-07 -4.901570
## claKM=5 2.567929e-07 -5.152678
## f.trip_distance_range=trip_dist_long 1.165863e-22 -9.796472
## f.dist=(5.5, 30] 4.571378e-23 -9.890631
## claKM=1 1.139133e-34 -12.281467
## hcpck=5 2.065346e-81 -19.110486
##
## $tolls_yes
## Cla/Mod Mod/Cla Global
## hcpck=5 100.0000000 83.333333 0.7613661
## claKM=1 10.8695652 83.333333 7.0045682
## f.dist=(5.5, 30] 5.8928571 78.571429 12.1818577
## f.trip_distance_range=trip_dist_long 5.3459119 80.952381 13.8351099
## claKM=5 19.3548387 14.285714 0.6743528
## f.espeed=[40,55] 5.6122449 26.190476 4.2636502
## f.payment_type=credit card 1.5377222 76.190476 45.2686535
## f.espeed=[20,40) 1.5817223 64.285714 37.1329128
## f.target.tip_is_given=tip_yes 1.5625000 64.285714 37.5897324
## qual.dropoff=09 3.2432432 14.285714 4.0243637
## f.period=period morning 2.0370370 26.190476 11.7467914
## qual.pickup=09 2.7173913 11.904762 4.0026104
## f.hour=other 1.1209239 78.571429 64.0417664
## hcpck=2 0.0000000 0.000000 7.2003481
## claKM=3 0.1196172 2.380952 18.1857733
## f.target.tip_is_given=tip_no 0.5228303 35.714286 62.4102676
## f.payment_type=cash 0.3618818 21.428571 54.1005003
## f.dist=(1.6, 3] 0.0000000 0.000000 24.8857951
## claKM=2 0.0000000 0.000000 31.4335436
## claKM=4 0.0000000 0.000000 42.7017620
## f.espeed=[03,20) 0.1484781 9.523810 58.6034370
## f.dist=(0, 1.6] 0.0000000 0.000000 46.0735262
## f.trip_distance_range=trip_dist_short 0.0000000 0.000000 64.7161192
## hcpck=1 0.0000000 0.000000 74.1135523
## p.value v.test
## hcpck=5 2.065346e-81 19.110486
## claKM=1 1.139133e-34 12.281467
## f.dist=(5.5, 30] 4.571378e-23 9.890631
## f.trip_distance_range=trip_dist_long 1.165863e-22 9.796472
## claKM=5 2.567929e-07 5.152678
## f.espeed=[40,55] 9.507371e-07 4.901570
## f.payment_type=credit card 5.206205e-05 4.046172
## f.espeed=[20,40) 3.975441e-04 3.541709
## f.target.tip_is_given=tip_yes 5.031113e-04 3.479094
## qual.dropoff=09 7.382495e-03 2.679079
## f.period=period morning 9.930884e-03 2.578227
## qual.pickup=09 3.088826e-02 2.158509
## f.hour=other 4.522426e-02 2.002563
## hcpck=2 4.271649e-02 -2.026472
## claKM=3 2.393235e-03 -3.036523
## f.target.tip_is_given=tip_no 5.031113e-04 -3.479094
## f.payment_type=cash 1.747824e-05 -4.294887
## f.dist=(1.6, 3] 5.664839e-06 -4.538528
## claKM=2 1.199840e-07 -5.293520
## claKM=4 6.036808e-11 -6.542823
## f.espeed=[03,20) 3.507713e-11 -6.623509
## f.dist=(0, 1.6] 4.630049e-12 -6.916480
## f.trip_distance_range=trip_dist_short 7.035056e-20 -9.127113
## hcpck=1 1.296248e-25 -10.461617
##
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## q.tolls_amount 0.88798659 0.000000e+00
## q.target.total_amount 0.07543346 2.364785e-80
## q.tlenkm 0.05034083 1.513536e-53
## q.trip_distance 0.04976256 6.164186e-53
## q.fare_amount 0.03972037 2.126989e-42
## q.espeed 0.02674487 6.377097e-29
## q.tip_amount 0.02207282 4.207044e-24
##
## Description of each cluster by quantitative variables
## =====================================================
## $tolls_no
## v.test Mean in category Overall mean sd in category
## q.tip_amount -10.07207 0.9736443 0.99849250 1.694361
## q.espeed -11.08690 20.1565029 20.29155358 8.455435
## q.fare_amount -13.51129 11.3047103 11.45424939 7.612033
## q.trip_distance -15.12312 2.6201364 2.67531897 2.482351
## q.tlenkm -15.21073 4.2071453 4.29604038 3.973971
## q.target.total_amount -18.61967 13.4007333 13.63611051 8.495598
## q.tolls_amount -63.88416 0.0000000 0.04385662 0.000000
## Overall sd p.value
## q.tip_amount 1.741745 7.341303e-24
## q.espeed 8.599953 1.452271e-28
## q.fare_amount 7.813899 1.341590e-41
## q.trip_distance 2.576144 1.140128e-51
## q.tlenkm 4.126073 3.001553e-52
## q.target.total_amount 8.924872 2.225732e-77
## q.tolls_amount 0.484676 0.000000e+00
##
## $tolls_yes
## v.test Mean in category Overall mean sd in category
## q.tolls_amount 63.88416 4.800212 0.04385662 1.697068
## q.target.total_amount 18.61967 39.163333 13.63611051 15.265340
## q.tlenkm 15.21073 13.936919 4.29604038 7.539053
## q.trip_distance 15.12312 8.660000 2.67531897 4.684551
## q.fare_amount 13.51129 27.672125 11.45424939 11.546805
## q.espeed 11.08690 34.938122 20.29155358 11.170182
## q.tip_amount 10.07207 3.693333 0.99849250 3.655469
## Overall sd p.value
## q.tolls_amount 0.484676 0.000000e+00
## q.target.total_amount 8.924872 2.225732e-77
## q.tlenkm 4.126073 3.001553e-52
## q.trip_distance 2.576144 1.140128e-51
## q.fare_amount 7.813899 1.341590e-41
## q.espeed 8.599953 1.452271e-28
## q.tip_amount 1.741745 7.341303e-24
We proceed to explain the data obtained.
We start wit the description of the categorical variables that characterize the clusters, so in this output we do not have dimensions because it is the total association. We can see the intensity of the variables, in our case the variables that affect more to the clustering are Trip_distance_range, paidTolls and hcpck because are the one with the smallest p.value.
Next, we want to see for each cluster which are the categories that characterize them.
!!!!!!!!No heu de deixar de veure la relació dels clusters amb els 2 targets que teniu Total_amount i AnyTip.
!!! FALTA DESCRIPCIÓ CLASSES
We want to compare the hierarchical clustering, previously done, and the k-means clustering, so proceed to do the following.
table(df$hcpck,df$claKM)
##
## 1 2 3 4 5
## 1 0 1445 110 1852 0
## 2 89 0 131 111 0
## 3 1 0 0 0 24
## 4 203 0 595 0 1
## 5 29 0 0 0 6
## we must do a relabel
df$hcpck<-factor(df$hcpck,labels=c("kHP-1","kHP-2","kHP-3","kHP-4","kHP-5"))
df$claKM<-factor(df$claKM,levels=c(2,4,5,3,1),labels=c("kKM-2","kKM-4","kKM-5","kKM-3","kKM-1"))
tt<-table(df$hcpck,df$claKM); tt
##
## kKM-2 kKM-4 kKM-5 kKM-3 kKM-1
## kHP-1 1445 1852 0 110 0
## kHP-2 0 111 0 131 89
## kHP-3 0 0 24 0 1
## kHP-4 0 0 1 595 203
## kHP-5 0 0 6 0 29
100*sum(diag(tt)/sum(tt))
## [1] 47.94431
We have a concordance of the 47.94% so we can say that they are different, if we had a greater concordance, this would mean that they would be more similar.
The first thing we need to do is factor our numeric target variable, Total_amount, and name it f.cost. We are going to set 6 different categories.
df$f.cost[df$q.target.total_amount<=8] = "[0,8]"
df$f.cost[(df$q.target.total_amount>8) & (df$q.target.total_amount<=11)] = "(8,11]"
df$f.cost[(df$q.target.total_amount>11) & (df$q.target.total_amount<=18)] = "(11,18]"
df$f.cost[(df$q.target.total_amount>18) & (df$q.target.total_amount<= 30)] = "(18,30]"
df$f.cost[(df$q.target.total_amount>30) & (df$q.target.total_amount<= 50)] = "(30,50]"
df$f.cost[df$q.target.total_amount>50] = "(50,129)"
df$f.cost<-factor(df$f.cost)
table(df$f.cost)
##
## (11,18] (18,30] (30,50] (50,129) (8,11] [0,8]
## 1188 724 219 43 1150 1273
Once we have this factor, proceed to create a variable that associates the cost with the passenger groups, and we we a contingency table with 5 rows, one per kind of cost and 3 columns, one per each kind of group.
tt<-table(df[,c("f.cost","f.passenger_groups")]);tt
## f.passenger_groups
## f.cost passenger_couple passenger_group passenger_single
## (11,18] 77 89 1022
## (18,30] 58 72 594
## (30,50] 20 19 180
## (50,129) 4 6 33
## (8,11] 81 104 965
## [0,8] 104 103 1066
chisq.test(tt, simulate.p.value = TRUE) #to see if the rows and columns are independents. H0: Rows and columns are independent
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: tt
## X-squared = 10.542, df = NA, p-value = 0.3633
We get a p-value lower than 0.05 so we cannot assume the H0. ( 0.3898 < 0.05 = TRUE).
We are now going to take a look to the simple correspondences.
res.ca <- CA(tt)
Those observations far away from the gravity center will mean that represent less observations on the sample. If rows and columns are nearby, this will mean that there is a correspondence between them, which means that they occur simultaneously in the sample.
summary(res.ca)
##
## Call:
## CA(X = tt)
##
## The chi square of independence between the two variables is equal to 10.54229 (p-value = 0.3942705 ).
##
## Eigenvalues
## Dim.1 Dim.2
## Variance 0.002 0.001
## % of var. 76.167 23.833
## Cumulative % of var. 76.167 100.000
##
## Rows
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## (11,18] | 0.815 | -0.056 45.675 0.979 | -0.008 3.147 0.021 |
## (18,30] | 0.487 | 0.054 26.428 0.948 | -0.013 4.628 0.052 |
## (30,50] | 0.192 | 0.044 5.190 0.472 | 0.046 18.536 0.528 |
## (50,129) | 0.419 | 0.204 22.330 0.931 | -0.056 5.312 0.069 |
## (8,11] | 0.136 | 0.004 0.233 0.030 | -0.023 24.184 0.970 |
## [0,8] | 0.244 | 0.003 0.144 0.010 | 0.030 44.195 0.990 |
##
## Columns
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## passenger_couple | 0.859 | 0.083 29.418 0.598 | 0.068 63.099 0.402 |
## passenger_group | 1.157 | 0.106 54.741 0.827 | -0.048 36.710 0.173 |
## passenger_single | 0.278 | -0.018 15.840 0.996 | -0.001 0.192 0.004 |
We conclude that we can not reject the H0 for these pair of factors, and now we are going to see if we can see if there is independence between the cost and the travel time, so the first thing we are going to do is factor the travel time.
df$f.travel_time[df$q.travel_time<=5] = "[0,5]"
df$f.travel_time[(df$q.travel_time>5) & (df$q.travel_time<=10)] = "(5,10]"
df$f.travel_time[(df$q.travel_time>10) & (df$q.travel_time<=15)] = "(10,15]"
df$f.travel_time[(df$q.travel_time>15) & (df$q.travel_time<= 20)] = "(15,20]"
df$f.travel_time[(df$q.travel_time>20) & (df$q.travel_time<= 70)] = "(20,60]"
df$f.travel_time<-factor(df$f.travel_time)
table(df$f.travel_time)
##
## (10,15] (15,20] (20,60] (5,10] [0,5]
## 913 548 710 1511 885
Once we have this factor, proceed to create a variable that associates the cost with the traveltime.
new_f.cost <- ordered(df$f.cost, levels= c("[0,8]", "(8,11]", "(11,18]", "(18,30]", "(30,50]","(50,129)"))
new_f.travel_time <- ordered(df$f.travel_time, levels= c("[0,5]", "(5,10]", "(10,15]", "(15,20]","(20,60]"))
tt<-table(new_f.cost, new_f.travel_time);tt
## new_f.travel_time
## new_f.cost [0,5] (5,10] (10,15] (15,20] (20,60]
## [0,8] 774 486 3 3 3
## (8,11] 85 864 189 3 4
## (11,18] 8 156 613 314 89
## (18,30] 15 3 106 205 388
## (30,50] 2 2 1 23 189
## (50,129) 1 0 1 0 37
chisq.test(tt) #to see if the rows and columns are independents. H0: Rows and columns are independent
## Warning in chisq.test(tt): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tt
## X-squared = 6210.8, df = 20, p-value < 2.2e-16
We get a p-value smaller than 0.05 so we can reject the H0. ((< 2.2e-16) < 0.05). So there is dependence between the traveltime and the cost, as we suspected.
We are now going to take a look to the simple correspondences.
res.ca <- CA(tt)
plot(res.ca$row$coord[,1],res.ca$row$coord[,2],pch=19,col="blue",xlim=c(-1.5,1.5),ylim=c(-1.5,1.5),xlab="Axis 1",ylab="Axis 2", main="CA f.cost vs f.travel_time")
points(res.ca$col$coord[,1],res.ca$col$coord[,2],lwd=2,col="red")
text(res.ca$row$coord[,1],res.ca$row$coord[,2],lwd=2,col="blue",labels=levels(df$f.cost))
text(res.ca$col$coord[,1],res.ca$col$coord[,2],lwd=2,col="red",labels=levels(df$f.travel_time))
lines(res.ca$row$coord[,1],res.ca$row$coord[,2],lwd=2,col="blue")
lines(res.ca$col$coord[,1],res.ca$col$coord[,2],lwd=2,col="red")
We can see in the plot, clearly that there are some categories that occur simultaneously in the sample, for instant the trips up to 5 minutes with the cost up to 8, the trips between 5-10 minutes and the costs between 8-11, the same happen with the trips between 10-15 minutes and the costs between 11-18. There is a clear relation between the f.cost and f.travel_time categories.
summary(res.ca)
##
## Call:
## CA(X = tt)
##
## The chi square of independence between the two variables is equal to 6210.782 (p-value = 0 ).
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4
## Variance 0.764 0.392 0.194 0.010
## % of var. 56.176 28.797 14.281 0.746
## Cumulative % of var. 56.176 84.973 99.254 100.000
##
## Rows
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## [0,8] | 378.779 | -0.985 35.294 0.712 | 0.518 19.061 0.197 |
## (8,11] | 222.099 | -0.554 10.079 0.347 | -0.405 10.481 0.185 |
## (11,18] | 266.057 | 0.569 10.949 0.314 | -0.747 36.819 0.542 |
## (18,30] | 263.211 | 1.183 28.741 0.834 | 0.500 10.025 0.149 |
## (30,50] | 188.821 | 1.426 12.652 0.512 | 1.259 19.223 0.399 |
## (50,129) | 40.958 | 1.430 2.286 0.426 | 1.419 4.392 0.420 |
## Dim.3 ctr cos2
## [0,8] 0.352 17.761 0.091 |
## (8,11] -0.644 53.537 0.468 |
## (11,18] 0.381 19.325 0.141 |
## (18,30] -0.020 0.032 0.000 |
## (30,50] -0.550 7.396 0.076 |
## (50,129) -0.666 1.949 0.092 |
##
## Columns
## Iner*1000 Dim.1 ctr cos2 Dim.2 ctr cos2
## [0,5] | 347.244 | -1.012 25.989 0.572 | 0.672 22.354 0.252 |
## (5,10] | 237.476 | -0.653 18.466 0.594 | -0.222 4.175 0.069 |
## (10,15] | 200.256 | 0.463 5.604 0.214 | -0.835 35.605 0.696 |
## (15,20] | 143.916 | 0.938 13.819 0.734 | -0.300 2.751 0.075 |
## (20,60] | 431.034 | 1.332 36.121 0.640 | 0.941 35.115 0.319 |
## Dim.3 ctr cos2
## [0,5] 0.561 31.447 0.176 |
## (5,10] -0.491 41.051 0.336 |
## (10,15] 0.273 7.646 0.074 |
## (15,20] 0.423 11.042 0.149 |
## (20,60] -0.332 8.815 0.040 |
The first thing we can see from the summary is that we have a chi square statistic of 6210.782, great enough to reject the H0, which means the intensity of the relation is high. If we take a look at the variances from the different dimensions, we can see that all together sum more than 1.
mean(res.ca$eig[,1])
## [1] 0.3399815
Following the kaiser kriteria and the value got in the output, we should retain dimensions with a variance greater than 0.3399815. In this case, the first dimension fulfills this because its variance is 0.764, but it is not enough to work with data so, we would choose 2 o 3 dimensions for this case.
The Multiple correspondence analysis (MCA) is an extension of the simple correspondence analysis for summarizing and visualizing a data table containing more than two categorical variables.
MCA is generally used to analyse a data set from survey. The goal is to identify:
First, we load the libraries we’ll use:
library(FactoMineR)
library(factoextra)
Now, we can start computing the MCA for our categorical variables:
names(df)
## [1] "f.vendor_id" "f.rate_code_id"
## [3] "q.pickup_longitude" "q.pickup_latitude"
## [5] "q.dropoff_longitude" "q.dropoff_latitude"
## [7] "q.passenger_count" "q.trip_distance"
## [9] "q.fare_amount" "q.extra"
## [11] "q.mta_tax" "q.tip_amount"
## [13] "q.tolls_amount" "q.improvement_surcharge"
## [15] "q.target.total_amount" "f.payment_type"
## [17] "f.trip_type" "q.hour"
## [19] "f.period" "q.tlenkm"
## [21] "q.travel_time" "q.espeed"
## [23] "qual.pickup" "qual.dropoff"
## [25] "f.trip_distance_range" "f.passenger_groups"
## [27] "f.extra" "f.mta_tax"
## [29] "f.target.tip_is_given" "f.paid_tolls"
## [31] "f.improvement_surcharge" "f.dist"
## [33] "f.hour" "f.espeed"
## [35] "hcpck" "claKM"
## [37] "f.cost" "f.travel_time"
vars_con <- names(df)[c(10,11,14,15,18,20)]; length(vars_con)
## [1] 6
vars_dis <- names(df)[c(1,2,16,17,19,25,16,27,28,29,30,31,32,33,34,37,38)]; length(vars_dis)
## [1] 17
res.mca <- MCA(
df[,c(vars_dis,vars_con)],
quanti.sup=c(18:23),
quali.sup=c(10,16),
graph=FALSE
)
Let’s look at the supplementary quantitative variable q.target.total_amount. We can see that it is closer to the Dim2 than to the Dim1.
fviz_mca_var(res.mca, choice="quanti.sup", repel=TRUE, ggtheme=theme_minimal())
We can see that improvement subcharge and mta_tax are moving to the -1 value of dimension 2, while, for example, total amount and trip distance are moving to the 1 value of dimension 1. On the other hand, extra and hour are moving negatively towards both dimensions.
Cloud of individuals:
fviz_mca_ind(res.mca,geom=c("point"),col.ind="darkslateblue")
How many axes we have to consider for next Hierarchical Classification stage?
We consider, according to the generalized Kaiser theorem, all those dimensions such that their eigenvalue is greater than the mean. We see that the average gives us 0.06666667. Therefore, we will take up to dimension 13, which represents the 73.66% of the sample.
mean(res.mca$eig[,1])
## [1] 0.06666667
head(get_eigenvalue(res.mca), 15)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 0.26384657 12.367808 12.36781
## Dim.2 0.20301756 9.516448 21.88426
## Dim.3 0.14923427 6.995357 28.87961
## Dim.4 0.13827820 6.481791 35.36140
## Dim.5 0.13626758 6.387543 41.74895
## Dim.6 0.12472129 5.846311 47.59526
## Dim.7 0.12377231 5.801827 53.39708
## Dim.8 0.08746709 4.100020 57.49710
## Dim.9 0.07261661 3.403903 60.90101
## Dim.10 0.06900216 3.234476 64.13548
## Dim.11 0.06868199 3.219468 67.35495
## Dim.12 0.06778477 3.177411 70.53236
## Dim.13 0.06671884 3.127446 73.65981
## Dim.14 0.06641956 3.113417 76.77323
## Dim.15 0.06565171 3.077424 79.85065
We can also visualize the percentages of inertia explained by each MCA dimensions:
fviz_screeplot(res.mca,addlabels=TRUE,ylim=c(0,15),barfill="darkslateblue",barcolor="darkslateblue",linecolor="skyblue1")
Are they any individuals “too contributive”?
fviz_mca_ind(res.mca,geom=c("point"),col.ind="contrib",gradient.cols=c("darkslateblue", "red"))
Are there any groups?
fviz_mca_ind(res.mca, label="none", habillage="f.vendor_id", palette=c("darkslateblue", "red"))
fviz_mca_ind(res.mca, label="none", habillage="f.rate_code_id", palette=c("darkslateblue", "red"))
fviz_mca_ind(res.mca, label="none", habillage="f.trip_type", palette=c("darkslateblue", "red"))
We can see that individuals are more grouped according to some variables than others. For example, the vendor_id_mobile is along the entire dimension 1 but also in the center of gravity. In contrast, the rate_code_id_other-Other is only in the first dimension and does not touch the second at all.
Before looking at the categories, let’s look at its variables:
As we can see in the plot “Variables representation”, the correlation between the f.dist factor taking into account the eta2 and the second factorial axis is a value greater than 0.5. On the other hand, we can see that something similar happens with the f.extra factor and f.rate_code_id in dimension 1.
fviz_mca_var(res.mca, choice="mca.cor", repel=TRUE)
Now, let’s analyze the categories.
fviz_mca_var(res.mca, repel=TRUE)
As we can see, the “tolls_yes” category (f.paid_tolls variable) is the one farthest from the center of the plot (in dimension 2). The farther from the center of gravity, the more rarely this feature value appears in the sample represented by the dimension.
In addition, we see that in dimension 1 we also have two extremes, the “code_rate_id_other” category (“f.code_rate_id” variable) and the “trip_dispatch” category (“f.trip_type” variable), as well as other variables. As we have said, this means that these categories are rarely represented in this dimension.
Regardering the center of mass, we can say that we find the categories most represented by the dimensions.
To give an example, let’s suppose we look at the first dimension. An observation that we could find with high probability would be the following:
On the other hand, an observation that we could rarely find there would be…
We would follow the same logic for dimension 2 considering the f.payment_type variable.
res.desc <- dimdesc(res.mca, axes = c(1,2))
res.desc[[1]]
## $quanti
## correlation p.value
## q.tlenkm 0.17457064 8.816040e-33
## q.target.total_amount 0.15634650 1.512516e-26
## q.hour -0.08913004 1.417066e-09
## q.extra -0.24101919 9.409191e-62
## q.improvement_surcharge -0.96051595 0.000000e+00
## q.mta_tax -0.97117133 0.000000e+00
##
## $quali
## R2 p.value
## f.rate_code_id 0.919151779 0.000000e+00
## f.trip_type 0.940381326 0.000000e+00
## f.mta_tax 0.943173751 0.000000e+00
## f.improvement_surcharge 0.922590885 0.000000e+00
## f.extra 0.066345214 1.450901e-70
## f.dist 0.047418367 4.186288e-48
## f.trip_distance_range 0.038289486 1.129547e-39
## f.espeed 0.029732943 7.749701e-31
## f.cost 0.030057264 1.658596e-28
## f.travel_time 0.021783318 3.031007e-20
## f.period 0.012989156 5.716894e-13
## f.hour 0.012730127 7.770353e-11
## f.target.tip_is_given 0.002898345 2.603895e-04
## f.paid_tolls 0.002054556 2.112217e-03
##
## $category
## Estimate p.value
## f.improvement_surcharge=improvement_no 1.607065929 0.000000e+00
## f.mta_tax=mta_no 1.617803817 0.000000e+00
## f.trip_type=trip_dispatch 1.667084881 0.000000e+00
## f.rate_code_id=rate_code_id_other 1.576627175 0.000000e+00
## f.extra=extra_no 0.132743995 1.450901e-70
## f.trip_distance_range=trip_dist_long 0.170583302 7.482436e-37
## f.dist=(5.5, 30] 0.183274270 8.109149e-33
## f.cost=(18,30] 0.038529287 7.007220e-15
## f.hour=other 0.098964114 2.091317e-13
## f.travel_time=(20,60] 0.119708473 2.037710e-12
## f.espeed=[40,55] 0.162433425 3.378535e-12
## f.dist=(3, 5.5] 0.044207557 1.335210e-09
## f.period=period morning 0.098686219 3.605183e-08
## f.cost=(30,50] 0.085038152 6.861480e-08
## f.cost=(50,129) 0.299016115 3.560325e-07
## f.target.tip_is_given=tip_no 0.028546846 2.603895e-04
## f.travel_time=(15,20] 0.065995411 6.007228e-04
## f.paid_tolls=tolls_yes 0.122351814 2.112217e-03
## f.period=period valley 0.018306035 6.181552e-03
## f.hour=17 -0.029183800 6.020625e-03
## f.hour=21 -0.035227703 4.538762e-03
## f.paid_tolls=tolls_no -0.122351814 2.112217e-03
## f.hour=20 -0.035944825 1.061618e-03
## f.cost=[0,8] -0.140333251 4.590942e-04
## f.hour=18 -0.043005175 3.553376e-04
## f.target.tip_is_given=tip_yes -0.028546846 2.603895e-04
## f.cost=(8,11] -0.172634212 9.460386e-09
## f.period=period afternoon -0.093792233 1.437674e-09
## f.travel_time=(5,10] -0.090631815 1.615818e-15
## f.espeed=[20,40) -0.006491457 2.798744e-16
## f.trip_distance_range=trip_dist_short -0.125210848 1.706098e-25
## f.dist=(0, 1.6] -0.145326321 5.031659e-27
## f.espeed=[03,20) -0.155941968 9.783312e-28
## f.extra=extra_yes -0.132743995 1.450901e-70
## f.improvement_surcharge=improvement_yes -1.607065929 0.000000e+00
## f.mta_tax=mta_yes -1.617803817 0.000000e+00
## f.trip_type=trip_street_hail -1.667084881 0.000000e+00
## f.rate_code_id=rate_code_id_1 -1.576627175 0.000000e+00
##
## attr(,"class")
## [1] "condes" "list "
There is no info for the quantitative variables here.
In the first dimension we see that for the qualitative variables the most positively related, from more to less, are:
If we look at the categories, we see that the most related are,
res.desc[[2]]
## $quanti
## correlation p.value
## q.target.total_amount 0.84215186 0.000000e+00
## q.tlenkm 0.83818442 0.000000e+00
## q.improvement_surcharge 0.15701197 9.216823e-27
## q.mta_tax 0.14784473 7.000928e-24
## q.extra -0.06717325 5.156047e-06
## q.hour -0.10798031 2.125227e-13
##
## $quali
## R2 p.value
## f.trip_distance_range 0.80582681 0.000000e+00
## f.dist 0.80742833 0.000000e+00
## f.cost 0.76591458 0.000000e+00
## f.travel_time 0.60322014 0.000000e+00
## f.espeed 0.27298764 9.151825e-319
## f.payment_type 0.17080693 1.419248e-187
## f.payment_type.1 0.17080693 1.419248e-187
## f.target.tip_is_given 0.14113576 4.863688e-154
## f.paid_tolls 0.06948907 6.111645e-74
## f.period 0.03078810 6.177644e-31
## f.improvement_surcharge 0.02465276 9.216823e-27
## f.trip_type 0.02426577 2.310845e-26
## f.mta_tax 0.02185806 7.000928e-24
## f.hour 0.02518104 6.582648e-23
## f.rate_code_id 0.01685847 9.713490e-19
##
## $category
## Estimate p.value
## f.travel_time=(20,60] 0.54429712 0.000000e+00
## f.cost=(18,30] 0.28433045 0.000000e+00
## f.dist=(5.5, 30] 0.63575077 0.000000e+00
## f.trip_distance_range=trip_dist_long 0.53578103 0.000000e+00
## f.cost=(30,50] 0.56485567 1.378657e-212
## f.payment_type.1=f.payment_type.1_credit card 0.08507066 2.190693e-180
## f.payment_type=f.payment_type_credit card 0.08507066 2.190693e-180
## f.dist=(3, 5.5] 0.18888889 4.764268e-174
## f.espeed=[40,55] 0.54626703 1.234667e-161
## f.target.tip_is_given=tip_yes 0.17474015 4.863688e-154
## f.trip_distance_range=trip_dist_medium 0.02614347 2.257960e-140
## f.travel_time=(15,20] 0.22991541 9.547471e-77
## f.paid_tolls=tolls_yes 0.62416688 6.111645e-74
## f.cost=(50,129) 0.63371293 1.747716e-44
## f.improvement_surcharge=improvement_yes 0.23043739 9.216823e-27
## f.trip_type=trip_street_hail 0.23490585 2.310845e-26
## f.mta_tax=mta_yes 0.21603639 7.000928e-24
## f.rate_code_id=rate_code_id_1 0.18729904 9.713490e-19
## f.hour=other 0.09608762 8.365243e-18
## f.period=period night 0.05371230 5.375736e-12
## f.period=period morning 0.09409668 2.027004e-08
## f.payment_type.1=f.payment_type.1_no paid 0.20149678 1.448902e-04
## f.payment_type=f.payment_type_no paid 0.20149678 1.448902e-04
## f.hour=19 -0.02100842 2.718281e-03
## f.hour=17 -0.06388299 2.175150e-05
## f.travel_time=(10,15] -0.03486973 1.553800e-06
## f.hour=20 -0.08567284 2.562629e-08
## f.hour=18 -0.09287469 2.833208e-09
## f.cost=(11,18] -0.23777551 1.726809e-11
## f.dist=(1.6, 3] -0.29072475 1.327600e-12
## f.rate_code_id=rate_code_id_other -0.18729904 9.713490e-19
## f.mta_tax=mta_no -0.21603639 7.000928e-24
## f.period=period afternoon -0.12736706 3.940661e-26
## f.trip_type=trip_dispatch -0.23490585 2.310845e-26
## f.improvement_surcharge=improvement_no -0.23043739 9.216823e-27
## f.paid_tolls=tolls_no -0.62416688 6.111645e-74
## f.espeed=[20,40) -0.10583525 6.585328e-88
## f.cost=(8,11] -0.57133758 9.267624e-118
## f.travel_time=[0,5] -0.41767971 5.786395e-129
## f.target.tip_is_given=tip_no -0.17474015 4.863688e-154
## f.payment_type.1=f.payment_type.1_cash -0.28656744 1.163404e-188
## f.payment_type=f.payment_type_cash -0.28656744 1.163404e-188
## f.travel_time=(5,10] -0.37551168 9.027279e-206
## f.espeed=[03,20) -0.44043178 4.150395e-216
## f.cost=[0,8] -0.67378597 7.137444e-283
## f.dist=(0, 1.6] -0.53391490 0.000000e+00
## f.trip_distance_range=trip_dist_short -0.56192450 0.000000e+00
##
## attr(,"class")
## [1] "condes" "list "
There is no info for the quantitative variables here.
For the second dimension we see that for the qualitative variables the most positively related, from more to less, are:
We see that they are not very large numbers, however.
If we look at the categories, we see that the most related are,
names(df[,c(1:38)])
## [1] "f.vendor_id" "f.rate_code_id"
## [3] "q.pickup_longitude" "q.pickup_latitude"
## [5] "q.dropoff_longitude" "q.dropoff_latitude"
## [7] "q.passenger_count" "q.trip_distance"
## [9] "q.fare_amount" "q.extra"
## [11] "q.mta_tax" "q.tip_amount"
## [13] "q.tolls_amount" "q.improvement_surcharge"
## [15] "q.target.total_amount" "f.payment_type"
## [17] "f.trip_type" "q.hour"
## [19] "f.period" "q.tlenkm"
## [21] "q.travel_time" "q.espeed"
## [23] "qual.pickup" "qual.dropoff"
## [25] "f.trip_distance_range" "f.passenger_groups"
## [27] "f.extra" "f.mta_tax"
## [29] "f.target.tip_is_given" "f.paid_tolls"
## [31] "f.improvement_surcharge" "f.dist"
## [33] "f.hour" "f.espeed"
## [35] "hcpck" "claKM"
## [37] "f.cost" "f.travel_time"
res.mca_all <- MCA(
df[,c(1:38)],
quanti.sup=c(3,4,5,6,7,8,9,10,11,12,13,14,15,18,20,21,22),
quali.sup=c(26,37),
graph=FALSE
)
res.desc <- dimdesc(res.mca_all, axes = c(1,2))
res.desc[[1]]
## $quanti
## correlation p.value
## q.trip_distance 0.53263364 0.000000e+00
## q.fare_amount 0.53136451 0.000000e+00
## q.tlenkm 0.51800587 4.172064e-314
## q.target.total_amount 0.48997755 2.799762e-276
## q.espeed 0.34026621 5.746093e-125
## q.tip_amount 0.23819496 2.578180e-60
## q.tolls_amount 0.17395487 1.471058e-32
## q.pickup_latitude 0.07840060 1.024064e-07
## q.dropoff_latitude 0.06392553 1.441229e-05
## q.travel_time 0.03553830 1.596807e-02
## q.dropoff_longitude -0.06642238 6.566741e-06
## q.hour -0.37039423 1.838382e-149
## q.improvement_surcharge -0.37570314 4.646118e-154
## q.mta_tax -0.38317089 1.129402e-160
## q.extra -0.62829962 0.000000e+00
##
## $quali
## R2 p.value
## f.period 0.475926312 0.000000e+00
## qual.pickup 0.570138464 0.000000e+00
## qual.dropoff 0.554006489 0.000000e+00
## f.trip_distance_range 0.297157633 0.000000e+00
## f.extra 0.431238767 0.000000e+00
## f.dist 0.313582902 0.000000e+00
## f.hour 0.410646532 0.000000e+00
## hcpck 0.301588449 0.000000e+00
## claKM 0.550056124 0.000000e+00
## f.cost 0.271113602 6.370890e-312
## f.travel_time 0.201005058 1.458117e-220
## f.rate_code_id 0.151069937 1.158449e-165
## f.mta_tax 0.146819928 1.129402e-160
## f.trip_type 0.143972949 2.404801e-157
## f.improvement_surcharge 0.141152848 4.646118e-154
## f.espeed 0.110973170 4.544131e-118
## f.paid_tolls 0.032816069 3.289149e-35
## f.payment_type 0.011747415 1.628177e-12
## f.target.tip_is_given 0.008902662 1.456429e-10
##
## $category
## Estimate p.value
## f.hour=f.hour_other 0.5642223317 0.000000e+00
## f.extra=extra_no 0.3329791331 0.000000e+00
## hcpck=kHP-4 0.3095485923 2.375263e-308
## f.trip_distance_range=trip_dist_long 0.4386776525 3.091666e-286
## f.dist=(5.5, 30] 0.5076041966 2.987186e-262
## f.travel_time=(20,60] 0.4101756929 6.203796e-172
## f.period=period valley 0.2479052985 4.987184e-171
## f.period=period morning 0.4679327535 2.065347e-170
## f.rate_code_id=rate_code_id_other 0.6288867598 1.158449e-165
## f.mta_tax=mta_no 0.6280157608 1.129402e-160
## f.trip_type=trip_dispatch 0.6417918545 2.404801e-157
## f.improvement_surcharge=improvement_no 0.6184747477 4.646118e-154
## claKM=kKM-3 0.2060927391 1.484715e-139
## f.cost=(18,30] 0.1574430702 7.952456e-129
## claKM=kKM-2 0.0611295379 1.641965e-110
## f.cost=(30,50] 0.4075498618 2.585431e-89
## claKM=kKM-1 0.3273843398 2.490957e-80
## f.espeed=[40,55] 0.4341932358 2.045776e-77
## qual.dropoff=qual.dropoff_09 0.5206060792 3.527235e-62
## qual.pickup=qual.pickup_09 0.4873399138 5.218506e-55
## qual.pickup=qual.pickup_10 0.4633844041 3.646431e-52
## qual.dropoff=qual.dropoff_10 0.4489272175 1.702829e-48
## qual.dropoff=qual.dropoff_11 0.4272577101 6.979807e-45
## qual.pickup=qual.pickup_08 0.4582747983 1.421497e-44
## f.dist=(3, 5.5] 0.0798383338 8.549770e-42
## f.paid_tolls=tolls_yes 0.4811081501 3.289149e-35
## qual.pickup=qual.pickup_13 0.3869145611 6.205687e-35
## qual.dropoff=qual.dropoff_13 0.3779304006 6.399300e-35
## qual.pickup=qual.pickup_11 0.3827948763 2.466544e-33
## qual.dropoff=qual.dropoff_15 0.3133715534 3.645156e-32
## hcpck=kHP-5 0.7364199684 6.675790e-32
## qual.pickup=qual.pickup_14 0.2923558665 7.259745e-30
## qual.dropoff=qual.dropoff_08 0.3743437264 1.430755e-29
## qual.pickup=qual.pickup_12 0.3237103406 2.033021e-27
## qual.dropoff=qual.dropoff_14 0.2787546880 1.121906e-26
## qual.pickup=qual.pickup_15 0.2717641191 1.597214e-26
## f.cost=(50,129) 0.5666629418 1.598760e-26
## qual.dropoff=qual.dropoff_12 0.3182948434 3.484426e-25
## f.hour=f.hour_22 0.0215493704 2.179804e-22
## qual.pickup=qual.pickup_06 0.4169700268 1.873719e-13
## f.travel_time=(15,20] 0.0896536455 3.360269e-13
## qual.pickup=qual.pickup_07 0.2205245296 8.666712e-11
## f.target.tip_is_given=tip_yes 0.0492256061 1.456429e-10
## qual.dropoff=qual.dropoff_07 0.2116667491 3.298940e-09
## qual.dropoff=qual.dropoff_06 0.3679791192 9.930149e-09
## qual.dropoff=qual.dropoff_16 0.0199691003 1.002024e-03
## qual.pickup=qual.pickup_05 0.1345777147 3.423808e-03
## qual.dropoff=qual.dropoff_05 0.1184692575 5.641972e-03
## claKM=kKM-5 0.0095805870 3.781363e-02
## qual.pickup=qual.pickup_16 -0.0108517278 2.645258e-02
## f.cost=(11,18] -0.2910709054 3.671583e-04
## qual.pickup=qual.pickup_00 -0.1974951385 2.353581e-04
## qual.pickup=qual.pickup_23 -0.2193709920 2.896184e-05
## qual.dropoff=qual.dropoff_00 -0.2158478525 2.438928e-05
## qual.dropoff=qual.dropoff_23 -0.2246208712 6.895710e-06
## hcpck=kHP-2 -0.3839791153 1.929391e-06
## f.target.tip_is_given=tip_no -0.0492256061 1.456429e-10
## f.payment_type=credit card -0.0007651156 2.584388e-12
## f.payment_type=cash -0.1078145260 3.440356e-13
## f.trip_distance_range=trip_dist_medium -0.0792105292 1.213534e-14
## f.dist=(1.6, 3] -0.2508750209 3.715169e-17
## f.travel_time=[0,5] -0.1927378476 5.679657e-19
## qual.dropoff=qual.dropoff_17 -0.3689760215 2.096554e-21
## qual.pickup=qual.pickup_22 -0.3791978473 2.179804e-22
## f.period=period night -0.1772817876 2.133618e-23
## f.espeed=[20,40) -0.1134269758 2.562501e-24
## qual.dropoff=qual.dropoff_22 -0.4125440180 3.743103e-26
## f.cost=[0,8] -0.3779649242 3.173708e-28
## f.hour=f.hour_21 -0.0527787900 1.999349e-32
## qual.pickup=qual.pickup_21 -0.4535260078 1.999349e-32
## f.paid_tolls=tolls_no -0.4811081501 3.289149e-35
## f.hour=f.hour_17 -0.0704859474 6.798281e-38
## qual.pickup=qual.pickup_17 -0.4712331651 6.798281e-38
## qual.dropoff=qual.dropoff_21 -0.4841582760 3.415373e-42
## f.hour=f.hour_19 -0.0574564004 9.301417e-44
## qual.pickup=qual.pickup_19 -0.4582036181 9.301417e-44
## qual.dropoff=qual.dropoff_19 -0.4788810554 7.398448e-49
## qual.dropoff=qual.dropoff_18 -0.5293547756 1.453545e-61
## f.hour=f.hour_18 -0.1319709167 4.078017e-63
## qual.pickup=qual.pickup_18 -0.5327181344 4.078017e-63
## f.cost=(8,11] -0.4626200442 4.242677e-65
## f.espeed=[03,20) -0.3207662600 2.869434e-70
## f.travel_time=(5,10] -0.2436019576 1.175030e-70
## qual.dropoff=qual.dropoff_20 -0.6434861351 1.511503e-90
## f.hour=f.hour_20 -0.2730796476 4.836303e-106
## qual.pickup=qual.pickup_20 -0.6738268654 4.836303e-106
## f.dist=(0, 1.6] -0.3365675095 5.315171e-137
## f.improvement_surcharge=improvement_yes -0.6184747477 4.646118e-154
## f.trip_type=trip_street_hail -0.6417918545 2.404801e-157
## f.mta_tax=mta_yes -0.6280157608 1.129402e-160
## f.rate_code_id=rate_code_id_1 -0.6288867598 1.158449e-165
## hcpck=kHP-1 -0.3872078703 1.158910e-213
## f.trip_distance_range=trip_dist_short -0.3594671233 4.667936e-236
## claKM=kKM-4 -0.6041872038 0.000000e+00
## f.extra=extra_yes -0.3329791331 0.000000e+00
## f.period=period afternoon -0.5385562644 0.000000e+00
##
## attr(,"class")
## [1] "condes" "list "
In this dimension, since we have taken into account all the variables, we now have information for the quantitative variables. We see that, more or less, the most related are:
If we look at the categories, we see that the most related are,
res.desc[[2]]
## $quanti
## correlation p.value
## q.trip_distance 0.64383966 0.000000e+00
## q.tlenkm 0.64273304 0.000000e+00
## q.target.total_amount 0.63306027 0.000000e+00
## q.fare_amount 0.59446156 0.000000e+00
## q.espeed 0.51625418 1.228632e-311
## q.tip_amount 0.40550094 1.646892e-181
## q.extra 0.39234247 4.891855e-169
## q.tolls_amount 0.17839828 3.504139e-34
## q.passenger_count 0.13814512 5.008849e-21
## q.travel_time 0.09198370 4.153248e-10
## q.hour 0.05145983 4.822964e-04
## q.improvement_surcharge 0.03379315 2.194916e-02
## q.pickup_latitude -0.15157850 4.936687e-25
## q.dropoff_latitude -0.16523035 1.697628e-29
##
## $quali
## R2 p.value
## f.period 0.3956288348 0.000000e+00
## qual.pickup 0.4497408628 0.000000e+00
## qual.dropoff 0.4541554265 0.000000e+00
## f.trip_distance_range 0.4677644156 0.000000e+00
## f.extra 0.3202538147 0.000000e+00
## f.dist 0.4694364650 0.000000e+00
## hcpck 0.4491374539 0.000000e+00
## claKM 0.7166058242 0.000000e+00
## f.cost 0.4265269270 0.000000e+00
## f.espeed 0.2417939055 7.436378e-277
## f.travel_time 0.2343012598 6.739182e-263
## f.hour 0.1570264477 3.614870e-166
## f.target.tip_is_given 0.0667293401 5.620747e-71
## f.payment_type 0.0609271154 1.951163e-63
## f.paid_tolls 0.0372766297 7.540123e-40
## f.passenger_groups 0.0205219654 2.064564e-21
## f.improvement_surcharge 0.0011419768 2.194916e-02
## f.trip_type 0.0008648682 4.617068e-02
##
## $category
## Estimate p.value
## hcpck=kHP-4 0.26255958 0.000000e+00
## f.dist=(5.5, 30] 0.59673706 0.000000e+00
## f.extra=extra_yes 0.27704085 0.000000e+00
## f.trip_distance_range=trip_dist_long 0.51551902 0.000000e+00
## f.period=period night 0.41145671 1.855167e-317
## claKM=kKM-3 0.24738852 5.904931e-259
## f.cost=(18,30] 0.22950087 3.605765e-221
## f.travel_time=(20,60] 0.35145490 2.774938e-174
## claKM=kKM-1 0.45385858 5.767408e-170
## f.cost=(30,50] 0.49367395 4.016015e-133
## f.espeed=[40,55] 0.50853136 1.762037e-119
## f.target.tip_is_given=tip_yes 0.13011522 5.620747e-71
## f.dist=(3, 5.5] 0.10569735 2.650785e-67
## f.payment_type=credit card 0.15304790 9.363567e-65
## f.hour=f.hour_22 0.33250991 3.036245e-63
## qual.pickup=qual.pickup_22 0.47926655 3.036245e-63
## qual.dropoff=qual.dropoff_21 0.44688511 2.338317e-59
## f.hour=f.hour_21 0.32757758 4.386865e-59
## qual.pickup=qual.pickup_21 0.47433422 4.386865e-59
## qual.dropoff=qual.dropoff_22 0.45081015 4.318334e-54
## f.paid_tolls=tolls_yes 0.49505825 7.540123e-40
## qual.dropoff=qual.dropoff_23 0.35565312 1.043534e-32
## hcpck=kHP-5 0.57155995 8.077339e-32
## f.cost=(50,129) 0.55214551 4.462805e-29
## f.travel_time=(15,20] 0.10318867 3.130305e-26
## qual.dropoff=qual.dropoff_01 0.35661270 3.301842e-24
## qual.pickup=qual.pickup_01 0.35822811 7.823119e-24
## qual.pickup=qual.pickup_00 0.28501276 7.311138e-22
## qual.pickup=qual.pickup_23 0.28975596 5.305565e-21
## f.passenger_groups=passenger_group 0.13654479 1.360975e-20
## qual.dropoff=qual.dropoff_00 0.26610056 1.735299e-19
## qual.pickup=qual.pickup_03 0.34402483 1.863267e-16
## qual.dropoff=qual.dropoff_03 0.32493235 2.838061e-15
## f.hour=f.hour_20 0.04523078 2.890185e-15
## qual.pickup=qual.pickup_20 0.19198742 2.890185e-15
## qual.dropoff=qual.dropoff_05 0.50903412 4.992204e-15
## qual.pickup=qual.pickup_05 0.49419098 9.000559e-14
## qual.dropoff=qual.dropoff_02 0.29003509 2.176065e-13
## qual.dropoff=qual.dropoff_20 0.17546116 1.187881e-12
## qual.dropoff=qual.dropoff_04 0.29927505 7.076961e-12
## qual.pickup=qual.pickup_02 0.25497228 3.152283e-11
## f.period=period afternoon 0.14461917 7.608595e-10
## claKM=kKM-5 0.26267647 5.640938e-09
## qual.pickup=qual.pickup_04 0.26598694 1.951568e-08
## f.travel_time=f.travel_time.NA 0.22576324 2.131580e-04
## qual.pickup=qual.pickup_19 0.06242889 1.862563e-03
## qual.pickup=qual.pickup_06 0.13380490 1.672273e-02
## qual.dropoff=qual.dropoff_06 0.15616768 1.797863e-02
## f.improvement_surcharge=improvement_yes 0.05370866 2.194916e-02
## f.trip_type=trip_street_hail 0.04802499 4.617068e-02
## f.trip_type=trip_dispatch -0.04802499 4.617068e-02
## f.cost=(11,18] -0.29615023 4.189977e-02
## f.passenger_groups=passenger_couple -0.02813087 3.506270e-02
## f.improvement_surcharge=improvement_no -0.05370866 2.194916e-02
## qual.pickup=qual.pickup_07 -0.12700781 1.658808e-02
## f.hour=f.hour_19 -0.08432776 1.862563e-03
## hcpck=kHP-3 -0.04927854 5.381918e-04
## hcpck=kHP-2 -0.21815676 7.568311e-11
## claKM=kKM-4 -0.18649095 1.331950e-12
## qual.dropoff=qual.dropoff_09 -0.30121673 1.300743e-15
## qual.dropoff=qual.dropoff_11 -0.31237687 6.138443e-17
## qual.pickup=qual.pickup_09 -0.31977129 1.876752e-17
## f.passenger_groups=passenger_single -0.10841392 7.817837e-18
## qual.dropoff=qual.dropoff_08 -0.35384574 7.297022e-18
## qual.pickup=qual.pickup_16 -0.26527143 3.268857e-18
## qual.pickup=qual.pickup_08 -0.34711951 1.381176e-18
## qual.pickup=qual.pickup_10 -0.33714236 5.522121e-20
## qual.pickup=qual.pickup_11 -0.35928132 5.237051e-20
## qual.dropoff=qual.dropoff_10 -0.36053204 2.219965e-22
## qual.pickup=qual.pickup_13 -0.39890237 2.372084e-25
## qual.dropoff=qual.dropoff_16 -0.32642330 1.607292e-27
## qual.dropoff=qual.dropoff_13 -0.43026651 1.507793e-30
## f.trip_distance_range=trip_dist_medium -0.07313040 1.072903e-30
## f.dist=(1.6, 3] -0.31246907 9.446754e-32
## qual.dropoff=qual.dropoff_12 -0.47859237 9.426844e-36
## f.travel_time=[0,5] -0.29049636 9.666983e-38
## f.cost=(8,11] -0.43018853 7.542526e-38
## f.paid_tolls=tolls_no -0.49505825 7.540123e-40
## qual.dropoff=qual.dropoff_15 -0.44237277 4.881699e-40
## qual.pickup=qual.pickup_12 -0.49560743 6.579648e-41
## qual.pickup=qual.pickup_15 -0.44923414 1.845731e-42
## qual.dropoff=qual.dropoff_14 -0.49556182 4.306940e-50
## qual.pickup=qual.pickup_14 -0.48664875 1.576408e-50
## f.period=period morning -0.24398126 3.027662e-58
## f.payment_type=cash -0.08917443 1.835194e-63
## f.target.tip_is_given=tip_no -0.13011522 5.620747e-71
## f.travel_time=(5,10] -0.28876383 1.227157e-75
## f.espeed=[20,40) -0.06914010 3.321921e-98
## f.hour=f.hour_other -0.27946979 1.799468e-99
## f.cost=[0,8] -0.54898157 2.733800e-134
## f.dist=(0, 1.6] -0.38996534 1.642416e-198
## f.espeed=[03,20) -0.43939126 2.553516e-210
## f.period=period valley -0.31209462 1.081731e-261
## claKM=kKM-2 -0.77743263 0.000000e+00
## hcpck=kHP-1 -0.56668423 0.000000e+00
## f.extra=extra_no -0.27704085 0.000000e+00
## f.trip_distance_range=trip_dist_short -0.44238862 0.000000e+00
##
## attr(,"class")
## [1] "condes" "list "
In this dimension, since we have taken into account all the variables, we now have information for the quantitative variables. We see that, more or less, the most positively related are:
If we look at the categories, we see that the most related are,
res.hcpcMCA <- HCPC(res.mca,nb.clust=5,order=TRUE)
Note: If we chose the default number of cluster it would be 5, as we can guess from the inertia reduction plot, that follows the Elbow’s rule (number of black lines plus 1). In our case, after trying with bigger number of clusters, we decided that the default number of cluster was fine for our case and data.
Number of observations in each cluster:
table(res.hcpcMCA$data.clust$clust)
##
## 1 2 3 4 5
## 2702 1189 29 570 107
barplot(table(res.hcpcMCA$data.clust$clust), col="darkslateblue", border="darkslateblue", main="[hierarchical from mca] #observations/cluster")
res.hcpcMCA$desc.var$test.chi2
## p.value df
## f.rate_code_id 0.000000e+00 4
## f.payment_type 0.000000e+00 8
## f.trip_type 0.000000e+00 4
## f.trip_distance_range 0.000000e+00 8
## f.payment_type.1 0.000000e+00 8
## f.mta_tax 0.000000e+00 4
## f.improvement_surcharge 0.000000e+00 4
## f.dist 0.000000e+00 12
## f.cost 0.000000e+00 20
## f.travel_time 0.000000e+00 20
## f.espeed 1.774608e-227 8
## f.target.tip_is_given 1.412523e-59 4
## f.paid_tolls 1.638939e-37 4
## f.extra 2.649815e-27 4
## f.vendor_id 8.863949e-23 4
## f.period 1.935953e-05 12
## f.hour 2.353940e-02 24
We start wit the description of the categorical variables that characterize the clusters, so in this output we do not have dimensions because it is the total association. We can see the intensity of the variables, in our case the variables that affect more to the clustering are f.rate_code_id, f.payment_type, f.trip_type and f.period because are the one with the smallest p.value. The variables associated to the clusters are the ones that appear on the output.
Next, we want to see for each cluster which are the categories that characterize them. The clusters that contain more individuals are the first, the second and the fourth one. Clusters number 1 and 5 are the ones that have less individuals. We proceed to analyze them.
res.hcpcMCA$desc.var$category
## $`1`
## Cla/Mod Mod/Cla
## f.dist=(0, 1.6] 98.0169972 76.83197631
## f.trip_distance_range=trip_dist_short 90.6218487 99.77794226
## f.travel_time=(5,10] 93.6465917 52.36861584
## f.cost=[0,8] 96.9363708 45.66987417
## f.cost=(8,11] 95.7391304 40.74759437
## f.travel_time=[0,5] 94.0112994 30.79200592
## f.espeed=[03,20) 71.0467706 70.83641747
## f.payment_type.1=f.payment_type.1_cash 69.7627664 64.21169504
## f.payment_type=f.payment_type_cash 69.7627664 64.21169504
## f.target.tip_is_given=tip_no 66.6434298 70.76239822
## f.rate_code_id=rate_code_id_1 60.2632753 99.96299038
## f.trip_type=trip_street_hail 60.1513802 100.00000000
## f.mta_tax=mta_yes 60.2006689 99.92598075
## f.improvement_surcharge=improvement_yes 60.1426661 99.85196151
## f.paid_tolls=tolls_no 59.3194292 100.00000000
## f.extra=extra_yes 61.6498994 56.69874167
## f.hour=20 65.6765677 7.36491488
## f.period=period afternoon 61.8600683 26.83197631
## f.hour=18 64.3086817 7.40192450
## f.hour=other 57.0991848 62.21317543
## f.dist=(1.6, 3] 54.5454545 23.09400444
## f.period=period morning 51.4814815 10.28867506
## f.extra=extra_no 55.3977273 43.30125833
## f.payment_type.1=f.payment_type.1_no paid 0.0000000 0.00000000
## f.payment_type=f.payment_type_no paid 0.0000000 0.00000000
## f.cost=(50,129) 2.3255814 0.03700962
## f.paid_tolls=tolls_yes 0.0000000 0.00000000
## f.improvement_surcharge=improvement_no 3.6036036 0.14803849
## f.mta_tax=mta_no 1.7857143 0.07401925
## f.trip_type=trip_dispatch 0.0000000 0.00000000
## f.espeed=[20,40) 45.8699473 28.97853442
## f.rate_code_id=rate_code_id_other 0.8695652 0.03700962
## f.target.tip_is_given=tip_yes 45.7175926 29.23760178
## f.travel_time=(10,15] 37.6779847 12.73131014
## f.payment_type.1=f.payment_type.1_credit card 46.4680442 35.78830496
## f.payment_type=f.payment_type_credit card 46.4680442 35.78830496
## f.espeed=[40,55] 2.5510204 0.18504811
## f.cost=(30,50] 0.4566210 0.03700962
## f.cost=(11,18] 30.3030303 13.32346410
## f.travel_time=(15,20] 13.3211679 2.70170244
## f.dist=(5.5, 30] 0.3571429 0.07401925
## f.travel_time=(20,60] 3.3802817 0.88823094
## f.trip_distance_range=trip_dist_long 0.0000000 0.00000000
## f.cost=(18,30] 0.6906077 0.18504811
## f.dist=(3, 5.5] 0.0000000 0.00000000
## f.trip_distance_range=trip_dist_medium 0.6085193 0.22205774
## Global p.value
## f.dist=(0, 1.6] 46.0735262 0.000000e+00
## f.trip_distance_range=trip_dist_short 64.7161192 0.000000e+00
## f.travel_time=(5,10] 32.8692626 3.442605e-289
## f.cost=[0,8] 27.6919730 2.925544e-288
## f.cost=(8,11] 25.0163150 6.077316e-233
## f.travel_time=[0,5] 19.2516859 2.680563e-150
## f.espeed=[03,20) 58.6034370 2.416368e-90
## f.payment_type.1=f.payment_type.1_cash 54.1005003 5.654771e-61
## f.payment_type=f.payment_type_cash 54.1005003 5.654771e-61
## f.target.tip_is_given=tip_no 62.4102676 3.958843e-44
## f.rate_code_id=rate_code_id_1 97.4983685 1.197199e-43
## f.trip_type=trip_street_hail 97.7159017 6.822568e-42
## f.mta_tax=mta_yes 97.5636285 1.576666e-40
## f.improvement_surcharge=improvement_yes 97.5853818 8.923482e-37
## f.paid_tolls=tolls_no 99.0863607 5.229567e-17
## f.extra=extra_yes 54.0569937 1.795958e-05
## f.hour=20 6.5912552 1.108867e-02
## f.period=period afternoon 25.4948880 1.284590e-02
## f.hour=18 6.7652817 3.939460e-02
## f.hour=other 64.0417664 2.010820e-03
## f.dist=(1.6, 3] 24.8857951 8.245458e-04
## f.period=period morning 11.7467914 2.700040e-04
## f.extra=extra_no 45.9430063 1.795958e-05
## f.payment_type.1=f.payment_type.1_no paid 0.6308462 6.078909e-12
## f.payment_type=f.payment_type_no paid 0.6308462 6.078909e-12
## f.cost=(50,129) 0.9353926 1.376472e-15
## f.paid_tolls=tolls_yes 0.9136393 5.229567e-17
## f.improvement_surcharge=improvement_no 2.4146182 8.923482e-37
## f.mta_tax=mta_no 2.4363715 1.576666e-40
## f.trip_type=trip_dispatch 2.2840983 6.822568e-42
## f.espeed=[20,40) 37.1329128 2.513636e-42
## f.rate_code_id=rate_code_id_other 2.5016315 1.197199e-43
## f.target.tip_is_given=tip_yes 37.5897324 3.958843e-44
## f.travel_time=(10,15] 19.8607788 8.576670e-47
## f.payment_type.1=f.payment_type.1_credit card 45.2686535 7.517506e-54
## f.payment_type=f.payment_type_credit card 45.2686535 7.517506e-54
## f.espeed=[40,55] 4.2636502 1.719745e-68
## f.cost=(30,50] 4.7639765 7.282856e-86
## f.cost=(11,18] 25.8429410 2.183225e-118
## f.travel_time=(15,20] 11.9208179 3.738800e-122
## f.dist=(5.5, 30] 12.1818577 4.688965e-235
## f.travel_time=(20,60] 15.4448553 1.358713e-262
## f.trip_distance_range=trip_dist_long 13.8351099 5.048966e-278
## f.cost=(18,30] 15.7494018 9.860731e-309
## f.dist=(3, 5.5] 16.8588210 0.000000e+00
## f.trip_distance_range=trip_dist_medium 21.4487709 0.000000e+00
## v.test
## f.dist=(0, 1.6] Inf
## f.trip_distance_range=trip_dist_short Inf
## f.travel_time=(5,10] 36.342574
## f.cost=[0,8] 36.283691
## f.cost=(8,11] 32.587945
## f.travel_time=[0,5] 26.111764
## f.espeed=[03,20) 20.155331
## f.payment_type.1=f.payment_type.1_cash 16.473856
## f.payment_type=f.payment_type_cash 16.473856
## f.target.tip_is_given=tip_no 13.933601
## f.rate_code_id=rate_code_id_1 13.854360
## f.trip_type=trip_street_hail 13.560972
## f.mta_tax=mta_yes 13.328678
## f.improvement_surcharge=improvement_yes 12.667752
## f.paid_tolls=tolls_no 8.381414
## f.extra=extra_yes 4.288856
## f.hour=20 2.539892
## f.period=period afternoon 2.488013
## f.hour=18 2.060041
## f.hour=other -3.088630
## f.dist=(1.6, 3] -3.344421
## f.period=period morning -3.642499
## f.extra=extra_no -4.288856
## f.payment_type.1=f.payment_type.1_no paid -6.877788
## f.payment_type=f.payment_type_no paid -6.877788
## f.cost=(50,129) -7.987550
## f.paid_tolls=tolls_yes -8.381414
## f.improvement_surcharge=improvement_no -12.667752
## f.mta_tax=mta_no -13.328678
## f.trip_type=trip_dispatch -13.560972
## f.espeed=[20,40) -13.634014
## f.rate_code_id=rate_code_id_other -13.854360
## f.target.tip_is_given=tip_yes -13.933601
## f.travel_time=(10,15] -14.365032
## f.payment_type.1=f.payment_type.1_credit card -15.450230
## f.payment_type=f.payment_type_credit card -15.450230
## f.espeed=[40,55] -17.489595
## f.cost=(30,50] -19.638276
## f.cost=(11,18] -23.132460
## f.travel_time=(15,20] -23.503696
## f.dist=(5.5, 30] -32.736740
## f.travel_time=(20,60] -34.617908
## f.trip_distance_range=trip_dist_long -35.628633
## f.cost=(18,30] -37.559494
## f.dist=(3, 5.5] -Inf
## f.trip_distance_range=trip_dist_medium -Inf
##
## $`2`
## Cla/Mod Mod/Cla
## f.dist=(3, 5.5] 90.45161290 58.95710681
## f.trip_distance_range=trip_dist_medium 96.55172414 80.06728343
## f.cost=(11,18] 66.49831650 66.44238856
## f.travel_time=(15,20] 65.87591241 30.36164844
## f.travel_time=(10,15] 53.12157722 40.79058032
## f.cost=(18,30] 50.82872928 30.95037847
## f.dist=(1.6, 3] 42.56993007 40.95878890
## f.payment_type.1=f.payment_type.1_credit card 35.89620375 62.82590412
## f.payment_type=f.payment_type_credit card 35.89620375 62.82590412
## f.target.tip_is_given=tip_yes 36.57407407 53.15391085
## f.mta_tax=mta_yes 26.51059086 100.00000000
## f.travel_time=(20,60] 38.30985915 22.87636669
## f.trip_type=trip_street_hail 26.46927872 100.00000000
## f.rate_code_id=rate_code_id_1 26.50602410 99.91589571
## f.improvement_surcharge=improvement_yes 26.48238966 99.91589571
## f.espeed=[20,40) 30.28705331 43.48191758
## f.period=period morning 29.62962963 13.45668629
## f.hour=22 19.51219512 4.03700589
## f.payment_type.1=f.payment_type.1_no paid 0.00000000 0.00000000
## f.payment_type=f.payment_type_no paid 0.00000000 0.00000000
## f.espeed=[03,20) 23.45953972 53.15391085
## f.cost=(50,129) 0.00000000 0.00000000
## f.improvement_surcharge=improvement_no 0.90090090 0.08410429
## f.rate_code_id=rate_code_id_other 0.86956522 0.08410429
## f.trip_type=trip_dispatch 0.00000000 0.00000000
## f.mta_tax=mta_no 0.00000000 0.00000000
## f.cost=(30,50] 4.56621005 0.84104289
## f.target.tip_is_given=tip_no 19.41443012 46.84608915
## f.payment_type.1=f.payment_type.1_cash 17.77241657 37.17409588
## f.payment_type=f.payment_type_cash 17.77241657 37.17409588
## f.trip_distance_range=trip_dist_long 5.50314465 2.94365013
## f.dist=(5.5, 30] 0.00000000 0.00000000
## f.travel_time=[0,5] 0.45197740 0.33641716
## f.cost=(8,11] 1.82608696 1.76619008
## f.travel_time=(5,10] 4.10324289 5.21446594
## f.cost=[0,8] 0.00000000 0.00000000
## f.dist=(0, 1.6] 0.04721435 0.08410429
## f.trip_distance_range=trip_dist_short 6.78991597 16.98906644
## Global p.value
## f.dist=(3, 5.5] 16.8588210 0.000000e+00
## f.trip_distance_range=trip_dist_medium 21.4487709 0.000000e+00
## f.cost=(11,18] 25.8429410 7.181504e-280
## f.travel_time=(15,20] 11.9208179 1.071920e-99
## f.travel_time=(10,15] 19.8607788 9.723715e-89
## f.cost=(18,30] 15.7494018 2.025095e-56
## f.dist=(1.6, 3] 24.8857951 4.351112e-47
## f.payment_type.1=f.payment_type.1_credit card 45.2686535 2.506763e-45
## f.payment_type=f.payment_type_credit card 45.2686535 2.506763e-45
## f.target.tip_is_given=tip_yes 37.5897324 4.191162e-37
## f.mta_tax=mta_yes 97.5636285 1.713764e-15
## f.travel_time=(20,60] 15.4448553 1.728794e-15
## f.trip_type=trip_street_hail 97.7159017 1.477069e-14
## f.rate_code_id=rate_code_id_1 97.4983685 2.958780e-14
## f.improvement_surcharge=improvement_yes 97.5853818 9.797732e-14
## f.espeed=[20,40) 37.1329128 1.717237e-07
## f.period=period morning 11.7467914 3.544946e-02
## f.hour=22 5.3513161 1.697249e-02
## f.payment_type.1=f.payment_type.1_no paid 0.6308462 1.649226e-04
## f.payment_type=f.payment_type_no paid 0.6308462 1.649226e-04
## f.espeed=[03,20) 58.6034370 1.013368e-05
## f.cost=(50,129) 0.9353926 2.404906e-06
## f.improvement_surcharge=improvement_no 2.4146182 9.797732e-14
## f.rate_code_id=rate_code_id_other 2.5016315 2.958780e-14
## f.trip_type=trip_dispatch 2.2840983 1.477069e-14
## f.mta_tax=mta_no 2.4363715 1.713764e-15
## f.cost=(30,50] 4.7639765 1.881901e-17
## f.target.tip_is_given=tip_no 62.4102676 4.191162e-37
## f.payment_type.1=f.payment_type.1_cash 54.1005003 3.273197e-42
## f.payment_type=f.payment_type_cash 54.1005003 3.273197e-42
## f.trip_distance_range=trip_dist_long 13.8351099 3.087873e-46
## f.dist=(5.5, 30] 12.1818577 3.186844e-79
## f.travel_time=[0,5] 19.2516859 6.789458e-122
## f.cost=(8,11] 25.0163150 6.148203e-139
## f.travel_time=(5,10] 32.8692626 3.907702e-151
## f.cost=[0,8] 27.6919730 1.352628e-200
## f.dist=(0, 1.6] 46.0735262 0.000000e+00
## f.trip_distance_range=trip_dist_short 64.7161192 0.000000e+00
## v.test
## f.dist=(3, 5.5] Inf
## f.trip_distance_range=trip_dist_medium Inf
## f.cost=(11,18] 35.747706
## f.travel_time=(15,20] 21.194562
## f.travel_time=(10,15] 19.971625
## f.cost=(18,30] 15.827045
## f.dist=(1.6, 3] 14.411971
## f.payment_type.1=f.payment_type.1_credit card 14.129285
## f.payment_type=f.payment_type_credit card 14.129285
## f.target.tip_is_given=tip_yes 12.726907
## f.mta_tax=mta_yes 7.960478
## f.travel_time=(20,60] 7.959397
## f.trip_type=trip_street_hail 7.689504
## f.rate_code_id=rate_code_id_1 7.600110
## f.improvement_surcharge=improvement_yes 7.443601
## f.espeed=[20,40) 5.227601
## f.period=period morning 2.103187
## f.hour=22 -2.387303
## f.payment_type.1=f.payment_type.1_no paid -3.767451
## f.payment_type=f.payment_type_no paid -3.767451
## f.espeed=[03,20) -4.414301
## f.cost=(50,129) -4.716030
## f.improvement_surcharge=improvement_no -7.443601
## f.rate_code_id=rate_code_id_other -7.600110
## f.trip_type=trip_dispatch -7.689504
## f.mta_tax=mta_no -7.960478
## f.cost=(30,50] -8.500861
## f.target.tip_is_given=tip_no -12.726907
## f.payment_type.1=f.payment_type.1_cash -13.614738
## f.payment_type=f.payment_type_cash -13.614738
## f.trip_distance_range=trip_dist_long -14.276009
## f.dist=(5.5, 30] -18.845706
## f.travel_time=[0,5] -23.478345
## f.cost=(8,11] -25.091563
## f.travel_time=(5,10] -26.185300
## f.cost=[0,8] -30.218525
## f.dist=(0, 1.6] -Inf
## f.trip_distance_range=trip_dist_short -Inf
##
## $`3`
## Cla/Mod Mod/Cla Global
## f.payment_type.1=f.payment_type.1_no paid 100.000000 100.00000 0.6308462
## f.payment_type=f.payment_type_no paid 100.000000 100.00000 0.6308462
## f.vendor_id=vendor_id_mobile 2.998966 100.00000 21.0354579
## f.target.tip_is_given=tip_no 1.010805 100.00000 62.4102676
## f.travel_time=[0,5] 1.355932 41.37931 19.2516859
## f.period=period morning 1.481481 27.58621 11.7467914
## f.target.tip_is_given=tip_yes 0.000000 0.00000 37.5897324
## f.payment_type.1=f.payment_type.1_credit card 0.000000 0.00000 45.2686535
## f.payment_type=f.payment_type_credit card 0.000000 0.00000 45.2686535
## f.payment_type.1=f.payment_type.1_cash 0.000000 0.00000 54.1005003
## f.payment_type=f.payment_type_cash 0.000000 0.00000 54.1005003
## f.vendor_id=vendor_id_verifone 0.000000 0.00000 78.9645421
## p.value v.test
## f.payment_type.1=f.payment_type.1_no paid 5.932175e-76 18.443030
## f.payment_type=f.payment_type_no paid 5.932175e-76 18.443030
## f.vendor_id=vendor_id_mobile 1.659143e-20 9.282264
## f.target.tip_is_given=tip_no 1.094456e-06 4.873847
## f.travel_time=[0,5] 6.356658e-03 2.728793
## f.period=period morning 2.024023e-02 2.321865
## f.target.tip_is_given=tip_yes 1.094456e-06 -4.873847
## f.payment_type.1=f.payment_type.1_credit card 2.381972e-08 -5.581687
## f.payment_type=f.payment_type_credit card 2.381972e-08 -5.581687
## f.payment_type.1=f.payment_type.1_cash 1.402561e-10 -6.415614
## f.payment_type=f.payment_type_cash 1.402561e-10 -6.415614
## f.vendor_id=vendor_id_verifone 1.659143e-20 -9.282264
##
## $`4`
## Cla/Mod Mod/Cla
## f.dist=(5.5, 30] 94.82142857 93.1578947
## f.trip_distance_range=trip_dist_long 89.46540881 99.8245614
## f.travel_time=(20,60] 55.07042254 68.5964912
## f.cost=(30,50] 91.78082192 35.2631579
## f.cost=(18,30] 44.61325967 56.6666667
## f.espeed=[40,55] 72.44897959 24.9122807
## f.cost=(50,129) 90.69767442 6.8421053
## f.espeed=[20,40) 19.44932630 58.2456140
## f.paid_tolls=tolls_yes 78.57142857 5.7894737
## f.target.tip_is_given=tip_yes 16.66666667 50.5263158
## f.payment_type.1=f.payment_type.1_credit card 15.90581451 58.0701754
## f.payment_type=f.payment_type_credit card 15.90581451 58.0701754
## f.improvement_surcharge=improvement_yes 12.70619706 100.0000000
## f.trip_type=trip_street_hail 12.68922529 100.0000000
## f.travel_time=(15,20] 17.51824818 16.8421053
## f.mta_tax=mta_yes 12.64214047 99.4736842
## f.hour=other 13.68885870 70.7017544
## f.travel_time=f.travel_time.NA 36.66666667 1.9298246
## f.period=period night 14.46965052 41.4035088
## f.rate_code_id=rate_code_id_1 12.56135654 98.7719298
## f.rate_code_id=rate_code_id_other 6.08695652 1.2280702
## f.payment_type.1=f.payment_type.1_no paid 0.00000000 0.0000000
## f.payment_type=f.payment_type_no paid 0.00000000 0.0000000
## f.hour=18 8.36012862 4.5614035
## f.hour=20 8.25082508 4.3859649
## f.mta_tax=mta_no 2.67857143 0.5263158
## f.period=period afternoon 8.78839590 18.0701754
## f.trip_type=trip_dispatch 0.00000000 0.0000000
## f.improvement_surcharge=improvement_no 0.00000000 0.0000000
## f.travel_time=(10,15] 7.33844469 11.7543860
## f.payment_type.1=f.payment_type.1_cash 9.60997185 41.9298246
## f.payment_type=f.payment_type_cash 9.60997185 41.9298246
## f.target.tip_is_given=tip_no 9.82920878 49.4736842
## f.dist=(3, 5.5] 5.03225806 6.8421053
## f.paid_tolls=tolls_no 11.78924259 94.2105263
## f.travel_time=[0,5] 0.11299435 0.1754386
## f.trip_distance_range=trip_dist_medium 0.00000000 0.0000000
## f.cost=(11,18] 0.58922559 1.2280702
## f.dist=(1.6, 3] 0.00000000 0.0000000
## f.cost=(8,11] 0.00000000 0.0000000
## f.cost=[0,8] 0.00000000 0.0000000
## f.travel_time=(5,10] 0.26472535 0.7017544
## f.espeed=[03,20) 3.56347439 16.8421053
## f.dist=(0, 1.6] 0.00000000 0.0000000
## f.trip_distance_range=trip_dist_short 0.03361345 0.1754386
## Global p.value
## f.dist=(5.5, 30] 12.1818577 0.000000e+00
## f.trip_distance_range=trip_dist_long 13.8351099 0.000000e+00
## f.travel_time=(20,60] 15.4448553 6.364837e-223
## f.cost=(30,50] 4.7639765 3.289169e-173
## f.cost=(18,30] 15.7494018 3.743313e-135
## f.espeed=[40,55] 4.2636502 2.697019e-90
## f.cost=(50,129) 0.9353926 1.053413e-31
## f.espeed=[20,40) 37.1329128 8.093424e-28
## f.paid_tolls=tolls_yes 0.9136393 8.168114e-23
## f.target.tip_is_given=tip_yes 37.5897324 1.891131e-11
## f.payment_type.1=f.payment_type.1_credit card 45.2686535 6.145092e-11
## f.payment_type=f.payment_type_credit card 45.2686535 6.145092e-11
## f.improvement_surcharge=improvement_yes 97.5853818 3.429456e-07
## f.trip_type=trip_street_hail 97.7159017 7.745137e-07
## f.travel_time=(15,20] 11.9208179 2.060266e-04
## f.mta_tax=mta_yes 97.5636285 2.982072e-04
## f.hour=other 64.0417664 3.429397e-04
## f.travel_time=f.travel_time.NA 0.6525995 6.941860e-04
## f.period=period night 35.4796606 1.757577e-03
## f.rate_code_id=rate_code_id_1 97.4983685 2.754068e-02
## f.rate_code_id=rate_code_id_other 2.5016315 2.754068e-02
## f.payment_type.1=f.payment_type.1_no paid 0.6308462 2.124484e-02
## f.payment_type=f.payment_type_no paid 0.6308462 2.124484e-02
## f.hour=18 6.7652817 2.032736e-02
## f.hour=20 6.5912552 1.859877e-02
## f.mta_tax=mta_no 2.4363715 2.982072e-04
## f.period=period afternoon 25.4948880 7.687879e-06
## f.trip_type=trip_dispatch 2.2840983 7.745137e-07
## f.improvement_surcharge=improvement_no 2.4146182 3.429456e-07
## f.travel_time=(10,15] 19.8607788 5.015019e-08
## f.payment_type.1=f.payment_type.1_cash 54.1005003 5.114647e-10
## f.payment_type=f.payment_type_cash 54.1005003 5.114647e-10
## f.target.tip_is_given=tip_no 62.4102676 1.891131e-11
## f.dist=(3, 5.5] 16.8588210 8.690161e-14
## f.paid_tolls=tolls_no 99.0863607 8.168114e-23
## f.travel_time=[0,5] 19.2516859 1.765999e-55
## f.trip_distance_range=trip_dist_medium 21.4487709 4.113524e-65
## f.cost=(11,18] 25.8429410 1.008505e-67
## f.dist=(1.6, 3] 24.8857951 3.442217e-77
## f.cost=(8,11] 25.0163150 1.165073e-77
## f.cost=[0,8] 27.6919730 1.653348e-87
## f.travel_time=(5,10] 32.8692626 5.356202e-99
## f.espeed=[03,20) 58.6034370 2.368699e-106
## f.dist=(0, 1.6] 46.0735262 1.764846e-168
## f.trip_distance_range=trip_dist_short 64.7161192 4.790507e-289
## v.test
## f.dist=(5.5, 30] Inf
## f.trip_distance_range=trip_dist_long Inf
## f.travel_time=(20,60] 31.872783
## f.cost=(30,50] 28.056917
## f.cost=(18,30] 24.742396
## f.espeed=[40,55] 20.149892
## f.cost=(50,129) 11.716159
## f.espeed=[20,40) 10.932121
## f.paid_tolls=tolls_yes 9.832361
## f.target.tip_is_given=tip_yes 6.714190
## f.payment_type.1=f.payment_type.1_credit card 6.540165
## f.payment_type=f.payment_type_credit card 6.540165
## f.improvement_surcharge=improvement_yes 5.098174
## f.trip_type=trip_street_hail 4.941682
## f.travel_time=(15,20] 3.711510
## f.mta_tax=mta_yes 3.616852
## f.hour=other 3.580498
## f.travel_time=f.travel_time.NA 3.391865
## f.period=period night 3.128405
## f.rate_code_id=rate_code_id_1 2.203768
## f.rate_code_id=rate_code_id_other -2.203768
## f.payment_type.1=f.payment_type.1_no paid -2.303605
## f.payment_type=f.payment_type_no paid -2.303605
## f.hour=18 -2.320250
## f.hour=20 -2.353477
## f.mta_tax=mta_no -3.616852
## f.period=period afternoon -4.473697
## f.trip_type=trip_dispatch -4.941682
## f.improvement_surcharge=improvement_no -5.098174
## f.travel_time=(10,15] -5.450777
## f.payment_type.1=f.payment_type.1_cash -6.215546
## f.payment_type=f.payment_type_cash -6.215546
## f.target.tip_is_given=tip_no -6.714190
## f.dist=(3, 5.5] -7.459424
## f.paid_tolls=tolls_no -9.832361
## f.travel_time=[0,5] -15.690164
## f.trip_distance_range=trip_dist_medium -17.040478
## f.cost=(11,18] -17.388493
## f.dist=(1.6, 3] -18.596307
## f.cost=(8,11] -18.654305
## f.cost=[0,8] -19.829604
## f.travel_time=(5,10] -21.118688
## f.espeed=[03,20) -21.904207
## f.dist=(0, 1.6] -27.666547
## f.trip_distance_range=trip_dist_short -36.333488
##
## $`5`
## Cla/Mod Mod/Cla
## f.trip_type=trip_dispatch 100.00000000 98.1308411
## f.mta_tax=mta_no 95.53571429 100.0000000
## f.improvement_surcharge=improvement_no 94.59459459 98.1308411
## f.rate_code_id=rate_code_id_other 92.17391304 99.0654206
## f.extra=extra_no 5.01893939 99.0654206
## f.target.tip_is_given=tip_no 3.10212618 83.1775701
## f.espeed=[20,40) 3.63210310 57.9439252
## f.travel_time=[0,5] 4.06779661 33.6448598
## f.trip_distance_range=trip_dist_long 4.08805031 24.2990654
## f.dist=(3, 5.5] 3.74193548 27.1028037
## f.payment_type.1=f.payment_type.1_cash 2.85484520 66.3551402
## f.payment_type=f.payment_type_cash 2.85484520 66.3551402
## f.dist=(5.5, 30] 3.92857143 20.5607477
## f.travel_time=(5,10] 1.58835208 22.4299065
## f.payment_type.1=f.payment_type.1_credit card 1.72993753 33.6448598
## f.payment_type=f.payment_type_credit card 1.72993753 33.6448598
## f.dist=(0, 1.6] 1.32200189 26.1682243
## f.target.tip_is_given=tip_yes 1.04166667 16.8224299
## f.espeed=[03,20) 1.37342242 34.5794393
## f.extra=extra_yes 0.04024145 0.9345794
## f.rate_code_id=rate_code_id_1 0.02231147 0.9345794
## f.improvement_surcharge=improvement_yes 0.04458315 1.8691589
## f.mta_tax=mta_yes 0.00000000 0.0000000
## f.trip_type=trip_street_hail 0.04452360 1.8691589
## Global p.value
## f.trip_type=trip_dispatch 2.284098 5.592430e-213
## f.mta_tax=mta_no 2.436372 7.437902e-212
## f.improvement_surcharge=improvement_no 2.414618 1.262945e-203
## f.rate_code_id=rate_code_id_other 2.501631 1.747518e-203
## f.extra=extra_no 45.943006 2.188043e-35
## f.target.tip_is_given=tip_no 62.410268 2.397273e-06
## f.espeed=[20,40) 37.132913 1.099810e-05
## f.travel_time=[0,5] 19.251686 3.747646e-04
## f.trip_distance_range=trip_dist_long 13.835110 3.388104e-03
## f.dist=(3, 5.5] 16.858821 7.102324e-03
## f.payment_type.1=f.payment_type.1_cash 54.100500 9.677540e-03
## f.payment_type=f.payment_type_cash 54.100500 9.677540e-03
## f.dist=(5.5, 30] 12.181858 1.273706e-02
## f.travel_time=(5,10] 32.869263 1.753725e-02
## f.payment_type.1=f.payment_type.1_credit card 45.268653 1.402855e-02
## f.payment_type=f.payment_type_credit card 45.268653 1.402855e-02
## f.dist=(0, 1.6] 46.073526 2.070698e-05
## f.target.tip_is_given=tip_yes 37.589732 2.397273e-06
## f.espeed=[03,20) 58.603437 4.627727e-07
## f.extra=extra_yes 54.056994 2.188043e-35
## f.rate_code_id=rate_code_id_1 97.498369 1.747518e-203
## f.improvement_surcharge=improvement_yes 97.585382 1.262945e-203
## f.mta_tax=mta_yes 97.563628 7.437902e-212
## f.trip_type=trip_street_hail 97.715902 5.592430e-213
## v.test
## f.trip_type=trip_dispatch 31.146868
## f.mta_tax=mta_no 31.063760
## f.improvement_surcharge=improvement_no 30.448265
## f.rate_code_id=rate_code_id_other 30.437609
## f.extra=extra_no 12.414222
## f.target.tip_is_given=tip_no 4.716677
## f.espeed=[20,40) 4.396558
## f.travel_time=[0,5] 3.557245
## f.trip_distance_range=trip_dist_long 2.930139
## f.dist=(3, 5.5] 2.692008
## f.payment_type.1=f.payment_type.1_cash 2.587143
## f.payment_type=f.payment_type_cash 2.587143
## f.dist=(5.5, 30] 2.491038
## f.travel_time=(5,10] -2.375246
## f.payment_type.1=f.payment_type.1_credit card -2.456531
## f.payment_type=f.payment_type_credit card -2.456531
## f.dist=(0, 1.6] -4.257128
## f.target.tip_is_given=tip_yes -4.716677
## f.espeed=[03,20) -5.041138
## f.extra=extra_yes -12.414222
## f.rate_code_id=rate_code_id_1 -30.437609
## f.improvement_surcharge=improvement_yes -30.448265
## f.mta_tax=mta_yes -31.063760
## f.trip_type=trip_street_hail -31.146868
!!! FALTA DESCRIPCIÓ CLUSTERS
We now proceed to see the quantitative variables that characterizes the clusters.
res.hcpcMCA$desc.var$quanti.var
## Eta2 P-value
## q.mta_tax 0.954400361 0.000000e+00
## q.improvement_surcharge 0.926870495 0.000000e+00
## q.target.total_amount 0.696374175 0.000000e+00
## q.tlenkm 0.733105763 0.000000e+00
## q.extra 0.024246452 1.898067e-23
## q.hour 0.005594674 3.523440e-05
We can see in the output that the variable that appears is slightly over represented in the clusters. We can notice that q.target.total_amount is over represented with 0.7 units over the global mean. So it is practically the same as the global mean.
We want to know now which variables are associated with the quantitative variables.
res.hcpcMCA$desc.var$quanti
## $`1`
## v.test Mean in category Overall mean sd in category
## q.mta_tax 12.403864 0.4996299 0.4878181 0.01359820
## q.improvement_surcharge 11.953152 0.2995559 0.2927561 0.01153418
## q.extra 4.366047 0.3725019 0.3527300 0.36946101
## q.hour 3.731183 13.7102147 13.3976506 6.75082850
## q.tlenkm -45.387774 1.9826565 4.2960404 0.83433807
## q.target.total_amount -46.567067 8.5021466 13.6361105 2.57805277
## Overall sd p.value
## q.mta_tax 0.07708781 2.490182e-35
## q.improvement_surcharge 0.04605087 6.250833e-33
## q.extra 0.36659461 1.265153e-05
## q.hour 6.78141734 1.905825e-04
## q.tlenkm 4.12607282 0.000000e+00
## q.target.total_amount 8.92487225 0.000000e+00
##
## $`2`
## v.test Mean in category Overall mean sd in category
## q.target.total_amount 14.675258 16.9069302 13.6361105 3.989985582
## q.tlenkm 10.426291 5.3703652 4.2960404 1.457225793
## q.mta_tax 6.327890 0.5000000 0.4878181 0.000000000
## q.improvement_surcharge 6.079477 0.2997477 0.2927561 0.008696562
## Overall sd p.value
## q.target.total_amount 8.92487225 9.286196e-49
## q.tlenkm 4.12607282 1.880866e-25
## q.mta_tax 0.07708781 2.485360e-10
## q.improvement_surcharge 0.04605087 1.205751e-09
##
## $`3`
## NULL
##
## $`4`
## v.test Mean in category Overall mean sd in category
## q.tlenkm 52.709431 12.8228898 4.2960404 4.60411970
## q.target.total_amount 49.487922 30.9527895 13.6361105 9.87342462
## q.improvement_surcharge 4.012070 0.3000000 0.2927561 0.00000000
## q.mta_tax 3.159853 0.4973684 0.4878181 0.03617823
## q.extra -2.631155 0.3149123 0.3527300 0.33827023
## q.hour -3.758873 12.3982456 13.3976506 7.00107739
## Overall sd p.value
## q.tlenkm 4.12607282 0.0000000000
## q.target.total_amount 8.92487225 0.0000000000
## q.improvement_surcharge 0.04605087 0.0000601887
## q.mta_tax 0.07708781 0.0015784853
## q.extra 0.36659461 0.0085095340
## q.hour 6.78141734 0.0001706803
##
## $`5`
## v.test Mean in category Overall mean sd in category
## q.hour -2.488662 11.785046729 13.3976506 6.76166771
## q.extra -9.936266 0.004672897 0.3527300 0.04811042
## q.improvement_surcharge -65.257084 0.005607477 0.2927561 0.04063003
## q.mta_tax -66.226368 0.000000000 0.4878181 0.00000000
## Overall sd p.value
## q.hour 6.78141734 1.282247e-02
## q.extra 0.36659461 2.894758e-23
## q.improvement_surcharge 0.04605087 0.000000e+00
## q.mta_tax 0.07708781 0.000000e+00
We can notice that every cluster has remarked the q.target.total_amount variable except the first one, that does not have any variable to be described.
!! FALTA DESCRIPCIÓ CLUSTERS
We are going to evaluate the partition quality.
## ( between sum of squares / total sum of squares ) * 100
((res.hcpcMCA$call$t$within[1]-res.hcpcMCA$call$t$within[5])/res.hcpcMCA$call$t$within[1])*100
## [1] 73.94977
The quality of this reduction if of 73.95%
In case we wanted to achieve an 80% of the clustering representativity we would need 6 clusters.
((res.hcpcMCA$call$t$within[1]-res.hcpcMCA$call$t$within[6])/res.hcpcMCA$call$t$within[1])*100
## [1] 81.42932
res.hcpcMCA$desc.ind$para ## representative individuals of each cluster
## Cluster: 1
## 1403696 330396 450361 119458 199108
## 0.06596882 0.07124413 0.07124413 0.08637699 0.08637699
## ------------------------------------------------------------
## Cluster: 2
## 29575 389704 389771 711068 774406
## 0.08761136 0.08761136 0.08761136 0.08761136 0.08761136
## ------------------------------------------------------------
## Cluster: 3
## 437922 825427 1150441 572644 221913
## 0.3499080 0.3512516 0.3572850 0.3631260 0.4026534
## ------------------------------------------------------------
## Cluster: 4
## 975120 1112715 1228516 1026671 1155586
## 0.1238459 0.1238459 0.1243729 0.1266106 0.1266106
## ------------------------------------------------------------
## Cluster: 5
## 779907 1272045 485688 27974 305082
## 0.1433246 0.1433246 0.1780173 0.2301847 0.2339695
What we obtain are the more representative individuals, paragons, for each cluster. We get the rownames of each paragon in every single cluster.
res.hcpcMCA$desc.ind$dist ## individuals distant from each cluster
## Cluster: 1
## 311984 528281 1262358 967890 646551
## 1.776258 1.701414 1.508609 1.341479 1.293639
## ------------------------------------------------------------
## Cluster: 2
## 624370 231280 695112 986535 423835
## 1.440229 1.372352 1.372352 1.372352 1.370701
## ------------------------------------------------------------
## Cluster: 3
## 453619 881540 274645 166154 128613
## 4.790505 4.688895 4.685513 4.676717 4.674723
## ------------------------------------------------------------
## Cluster: 4
## 826623 121215 1342604 194151 244971
## 2.380086 2.362877 2.237584 2.131013 2.086446
## ------------------------------------------------------------
## Cluster: 5
## 1083301 636795 1037672 131263 605405
## 3.480819 3.466755 3.460463 3.452474 3.444979
What we obtain are those individuals of each cluster that that far away in the same cluster from the rest of the individuals. We also obtain the rownames of each individual with the bigger distance respect the other ones in the cluster.
We get the grpahical representation for the individuals that characterize classes (para and dist).
## characteristic individuals
para1<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$para[[1]]))
dist1<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$dist[[1]]))
para2<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$para[[2]]))
dist2<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$dist[[2]]))
para3<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$para[[3]]))
dist3<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$dist[[3]]))
para4<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$para[[4]]))
dist4<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$dist[[4]]))
para5<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$para[[5]]))
dist5<-which(rownames(res.mca$ind$coord)%in%names(res.hcpcMCA$desc.ind$dist[[5]]))
plot(res.mca$ind$coord[,1],res.mca$ind$coord[,2],col="grey50",cex=0.5,pch=16)
points(res.mca$ind$coord[para1,1],res.mca$ind$coord[para1,2],col="blue",cex=1,pch=16)
points(res.mca$ind$coord[dist1,1],res.mca$ind$coord[dist1,2],col="chartreuse3",cex=1,pch=16)
points(res.mca$ind$coord[para2,1],res.mca$ind$coord[para2,2],col="blue",cex=1,pch=16)
points(res.mca$ind$coord[dist2,1],res.mca$ind$coord[dist2,2],col="darkorchid3",cex=1,pch=16)
points(res.mca$ind$coord[para3,1],res.mca$ind$coord[para3,2],col="blue",cex=1,pch=16)
points(res.mca$ind$coord[dist3,1],res.mca$ind$coord[dist3,2],col="firebrick3",cex=1,pch=16)
points(res.mca$ind$coord[para4,1],res.mca$ind$coord[para4,2],col="blue",cex=1,pch=16)
points(res.mca$ind$coord[dist4,1],res.mca$ind$coord[dist4,2],col="palevioletred3",cex=1,pch=16)
points(res.mca$ind$coord[para5,1],res.mca$ind$coord[para5,2],col="blue",cex=1,pch=16)
points(res.mca$ind$coord[dist5,1],res.mca$ind$coord[dist5,2],col="royalblue1",cex=1,pch=16)
df$hcpckMCA<-res.hcpcMCA$data.clust$clust
## With Hierarchical Clustering (PCA)
table(df$hcpck,df$hcpckMCA)
##
## 1 2 3 4 5
## kHP-1 2465 846 23 2 71
## kHP-2 221 79 1 27 3
## kHP-3 14 3 0 8 0
## kHP-4 2 255 5 504 33
## kHP-5 0 6 0 29 0
df$hcpckMCA_hcpck<-factor(
df$hcpckMCA,
levels=c(1,2,5,4,3),
labels=c("kHPmca-1","kHPmca-2","kHPmca-5","kHPmca-4","kHPmca-3")
); tt1<-table(df$hcpck,df$hcpckMCA_hcpck); tt1; 100*sum(diag(tt1)/sum(tt1))
##
## kHPmca-1 kHPmca-2 kHPmca-5 kHPmca-4 kHPmca-3
## kHP-1 2465 846 71 2 23
## kHP-2 221 79 3 27 1
## kHP-3 14 3 0 8 0
## kHP-4 2 255 33 504 5
## kHP-5 0 6 0 29 0
## [1] 66.30411
We have a concordance of the 66.30% so we can say that they are different, if we had a greater concordance, this would mean that they would be more similar.
## With k-means (PCA)
table(df$claKM, df$hcpckMCA)
##
## 1 2 3 4 5
## kKM-2 1049 323 11 0 62
## kKM-4 1506 439 12 0 6
## kKM-5 14 2 0 15 0
## kKM-3 86 391 5 325 29
## kKM-1 47 34 1 230 10
df$hcpckMCA_claKM<-factor(
df$hcpckMCA,
levels=c(2,1,5,3,4),
labels=c("kHPmca-2","kHPmca-1","kHPmca-5","kHPmca-3","kHPmca-4")
);tt2<-table(df$claKM,df$hcpckMCA_claKM); tt2;100*sum(diag(tt2)/sum(tt2))
##
## kHPmca-2 kHPmca-1 kHPmca-5 kHPmca-3 kHPmca-4
## kKM-2 323 1049 62 11 0
## kKM-4 439 1506 6 12 0
## kKM-5 2 14 0 0 15
## kKM-3 391 86 29 5 325
## kKM-1 34 47 10 1 230
## [1] 44.89885
We have a concordance of the 44.90% so we can say that they are different, if we had a greater concordance, this would mean that they would be more similar.
## res.hcpc$desc.var$quanti.var ## quantitative variables which characterizes the clusters
## ## Eta2 P-value
## ## q.target.total_amount 0.572312116 0.000000e+00
## res.cat <-catdes(df,29)
## res.cat
## ## Link between the cluster variable and the quantitative variables
## ## ================================================================
## ## Eta2 P-value
## ## q.target.total_amount 0.065791515 5.688596e-70
## ## res.hcpcMCA$desc.var$quanti.var
## ## Eta2 P-value
## ## q.target.total_amount 0.696374175 0.000000e+00
To compare the variable Total_amount in the three different classifications, we will look at Eta2:
## res.hcpc$desc.var$category ## description of each cluster by the categories
# # $`1`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_no 80.0278843 67.3906663 62.410268
# # f.target.tip_is_given=tip_yes 64.2939815 32.6093337 37.589732
# #
# # $`2`
# # Cla/Mod Mod/Cla Global
# # nothing to see here
# #
# # $`3`
# # Cla/Mod Mod/Cla Global p.value
# # f.target.tip_is_given=tip_no 0.7319624 84 62.41027 0.021898539
# # f.target.tip_is_given=tip_yes 0.2314815 16 37.58973 0.021898539
# #
# # $`4`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_yes 26.909722 58.197747 37.589732
# # f.target.tip_is_given=tip_no 11.641687 41.802253 62.410268
# #
# # $`5`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_yes 1.3888889 68.57143 37.58973
# # f.target.tip_is_given=tip_no 0.3834089 31.42857 62.41027
## wtf
## res.hcpcMCA$desc.var$category ## description of each cluster by the categories
# # $`1`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_no 66.6434298 70.76239822 62.4102676
# # f.target.tip_is_given=tip_yes 45.7175926 29.23760178 37.5897324
# #
# # $`2`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_yes 36.57407407 53.15391085 37.5897324
# # f.target.tip_is_given=tip_no 19.41443012 46.84608915 62.4102676
# #
# # $`3`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_no 1.010805 100.00000 62.4102676
# # f.target.tip_is_given=tip_yes 0.000000 0.00000 37.5897324
# #
# # $`4`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_yes 16.66666667 50.5263158 37.5897324
# # f.target.tip_is_given=tip_no 9.82920878 49.4736842 62.4102676
# #
# # $`5`
# # Cla/Mod Mod/Cla Global
# # f.target.tip_is_given=tip_no 3.10212618 83.1775701 62.410268
# # f.target.tip_is_given=tip_yes 1.04166667 16.8224299 37.589732
FALTA
We think that at first glance, we do not find the relationship between the different clusters of the different types of analysis. As we can see in the data, they are not distributed in the same way with respect to the two variables we had to analyze.
It makes sense to think this, since these variables have not been taken into account in the analyzes, as they had the role of supplementary variables, which means that they only served us as explanatory variables, and not to decide how to form clusters.
correlated with the previous –>
same bic –>
Comment
FALTA
To compare the variable TipIsGiven in the three different classifications, we will look at Cla / Mod, Mod / Cla and Global: